Example usage for org.apache.lucene.analysis Analyzer getPositionIncrementGap

List of usage examples for org.apache.lucene.analysis Analyzer getPositionIncrementGap

Introduction

In this page you can find the example usage for org.apache.lucene.analysis Analyzer getPositionIncrementGap.

Prototype

public int getPositionIncrementGap(String fieldName) 

Source Link

Document

Invoked before indexing an IndexableField instance if terms have already been added to that field.

Usage

From source file:com.liferay.portal.search.lucene.PerFieldAnalyzerWrapper.java

License:Open Source License

/**
 * Returns the position increment gap for {@code fieldName} by delegating
 * to whichever analyzer is registered for that field.
 */
@Override
public int getPositionIncrementGap(String fieldName) {
    // Resolve the per-field analyzer, then forward the call unchanged.
    return _getAnalyzer(fieldName).getPositionIncrementGap(fieldName);
}

From source file:lucandra.IndexWriter.java

License:Apache License

/**
 * Tokenizes the given document's fields and writes the resulting term
 * vectors, norms and stored fields into the pending Cassandra mutation map.
 *
 * @param doc      the Lucene document to index
 * @param analyzer analyzer used for fields without a pre-built token stream
 * @throws CorruptIndexException declared for Lucene API compatibility
 * @throws IOException           on token-stream or encoding failures
 */
@SuppressWarnings("unchecked")
public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexException, IOException {

    // Every term written for this document; persisted at the end so the
    // document can later be deleted.
    List<String> allIndexedTerms = new ArrayList<String>();

    // check for special field name
    String docId = doc.get(CassandraUtils.documentIdField);

    // No explicit id supplied: synthesize a pseudo-random hex id.
    // NOTE(review): nanoTime + random is not collision-proof — confirm
    // whether a stronger unique-id scheme (e.g. UUID) is required here.
    if (docId == null)
        docId = Long.toHexString((long) (System.nanoTime() + (Math.random() * System.nanoTime())));

    // Running token position, shared across all tokenized fields of this doc.
    int position = 0;

    for (Fieldable field : (List<Fieldable>) doc.getFields()) {

        // Untokenized fields go in without a termPosition

        if (field.isIndexed() && !field.isTokenized()) {

            String term = CassandraUtils.createColumnName(field.name(), field.stringValue());

            allIndexedTerms.add(term);

            String key = indexName + CassandraUtils.delimeter + term;

            // Empty frequency/position vectors: the term exists but carries
            // no positional information.
            Map<String, List<Number>> termMap = new HashMap<String, List<Number>>();

            termMap.put(CassandraUtils.termFrequencyKey, CassandraUtils.emptyArray);
            termMap.put(CassandraUtils.positionVectorKey, CassandraUtils.emptyArray);

            CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.termVecColumnFamily,
                    docId.getBytes("UTF-8"), CassandraUtils.hashKey(key), null, termMap);

        } else if (field.isIndexed()) {

            TokenStream tokens = field.tokenStreamValue();

            // Fall back to analyzing the raw string value when the field
            // does not carry a ready-made token stream.
            if (tokens == null) {
                tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
            }

            // collect term information per field
            Map<String, Map<String, List<Number>>> allTermInformation = new HashMap<String, Map<String, List<Number>>>();

            // NOTE(review): lastOffset is never updated below, so stored
            // offsets are always relative to 0 — confirm whether offsets of
            // multi-valued fields should accumulate across field instances.
            int lastOffset = 0;
            // Insert the analyzer's gap between successive tokenized fields
            // so phrase queries do not match across field instances.
            if (position > 0) {
                position += analyzer.getPositionIncrementGap(field.name());
            }

            // Build the termPositions vector for all terms

            tokens.reset(); // reset the TokenStream to the first token

            // set up token attributes we are working on

            // offsets
            OffsetAttribute offsetAttribute = null;
            if (field.isStoreOffsetWithTermVector())
                offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);

            // positions
            PositionIncrementAttribute posIncrAttribute = null;
            if (field.isStorePositionWithTermVector())
                posIncrAttribute = (PositionIncrementAttribute) tokens
                        .addAttribute(PositionIncrementAttribute.class);

            TermAttribute termAttribute = (TermAttribute) tokens.addAttribute(TermAttribute.class);

            // store normalizations of field per term per document rather
            // than per field.
            // this adds more to write but less to read on other side
            Integer tokensInField = new Integer(0);

            while (tokens.incrementToken()) {
                tokensInField++;
                String term = CassandraUtils.createColumnName(field.name(), termAttribute.term());

                allIndexedTerms.add(term);

                // fetch all collected information for this term
                Map<String, List<Number>> termInfo = allTermInformation.get(term);

                if (termInfo == null) {
                    termInfo = new HashMap<String, List<Number>>();
                    allTermInformation.put(term, termInfo);
                }

                // term frequency
                {
                    List<Number> termFrequency = termInfo.get(CassandraUtils.termFrequencyKey);

                    if (termFrequency == null) {
                        termFrequency = new ArrayList<Number>();
                        termFrequency.add(new Integer(0));
                        termInfo.put(CassandraUtils.termFrequencyKey, termFrequency);
                    }

                    // increment
                    termFrequency.set(0, termFrequency.get(0).intValue() + 1);
                }

                // position vector
                if (field.isStorePositionWithTermVector()) {
                    // Honor position increments > 1 (e.g. stop-word gaps);
                    // the -1 compensates for the ++position below.
                    position += (posIncrAttribute.getPositionIncrement() - 1);

                    List<Number> positionVector = termInfo.get(CassandraUtils.positionVectorKey);

                    if (positionVector == null) {
                        positionVector = new ArrayList<Number>();
                        termInfo.put(CassandraUtils.positionVectorKey, positionVector);
                    }

                    positionVector.add(++position);
                }

                // term offsets, stored as (start, end) pairs
                if (field.isStoreOffsetWithTermVector()) {

                    List<Number> offsetVector = termInfo.get(CassandraUtils.offsetVectorKey);
                    if (offsetVector == null) {
                        offsetVector = new ArrayList<Number>();
                        termInfo.put(CassandraUtils.offsetVectorKey, offsetVector);
                    }

                    offsetVector.add(lastOffset + offsetAttribute.startOffset());
                    offsetVector.add(lastOffset + offsetAttribute.endOffset());

                }
            }

            // Field-length norm, written alongside every term of this field.
            List<Number> bnorm = null;
            if (!field.getOmitNorms()) {
                bnorm = new ArrayList<Number>();
                float norm = doc.getBoost();
                norm *= field.getBoost();
                norm *= similarity.lengthNorm(field.name(), tokensInField);
                bnorm.add(Similarity.encodeNorm(norm));
            }

            for (Map.Entry<String, Map<String, List<Number>>> term : allTermInformation.entrySet()) {

                // Terms are stored within a unique key combination
                // This is required since cassandra loads all columns
                // in a key/column family into memory
                String key = indexName + CassandraUtils.delimeter + term.getKey();

                // Mix in the norm for this field alongside each term
                // more writes but faster on read side.
                if (!field.getOmitNorms()) {
                    term.getValue().put(CassandraUtils.normsKey, bnorm);
                }

                CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.termVecColumnFamily,
                        docId.getBytes("UTF-8"), CassandraUtils.hashKey(key), null, term.getValue());
            }
        }

        // Stores each field as a column under this doc key
        if (field.isStored()) {

            byte[] _value = field.isBinary() ? field.getBinaryValue() : field.stringValue().getBytes("UTF-8");

            // Append a one-byte binary/text marker to the value.
            // NOTE(review): an earlier comment said "first byte", but the
            // flag is actually written to the LAST byte of the array.
            byte[] value = new byte[_value.length + 1];
            System.arraycopy(_value, 0, value, 0, _value.length);

            value[value.length - 1] = (byte) (field.isBinary() ? Byte.MAX_VALUE : Byte.MIN_VALUE);

            String key = indexName + CassandraUtils.delimeter + docId;

            CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.docColumnFamily,
                    field.name().getBytes("UTF-8"), CassandraUtils.hashKey(key), value, null);

        }
    }

    // Finally, Store meta-data so we can delete this document
    String key = indexName + CassandraUtils.delimeter + docId;

    CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.docColumnFamily,
            CassandraUtils.documentMetaField.getBytes("UTF-8"), CassandraUtils.hashKey(key),
            CassandraUtils.toBytes(allIndexedTerms), null);

    // Flush the accumulated mutations immediately when auto-commit is on.
    if (autoCommit)
        CassandraUtils.robustBatchInsert(client, getMutationMap());
}

From source file:org.hbasene.index.HBaseIndexWriter.java

License:Apache License

public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexException, IOException {
    String docId = doc.get(this.primaryKeyField);
    if (docId == null) {
        throw new IllegalArgumentException(
                "Primary Key " + this.primaryKeyField + " not present in the document to be added ");
        // TODO: Special type of exception needed ?

    }//www  .jav  a2 s  . c  om
    int position = 0;
    Map<String, List<Integer>> termPositions = new HashMap<String, List<Integer>>();
    Map<String, byte[]> fieldsToStore = new HashMap<String, byte[]>();

    for (Fieldable field : doc.getFields()) {

        // Indexed field
        if (field.isIndexed() && field.isTokenized()) {
            TokenStream tokens = field.tokenStreamValue();

            if (tokens == null) {
                tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
            }
            tokens.addAttribute(TermAttribute.class);
            tokens.addAttribute(PositionIncrementAttribute.class);

            // collect term frequencies per doc
            if (position > 0) {
                position += analyzer.getPositionIncrementGap(field.name());
            }

            // Build the termPositions vector for all terms
            while (tokens.incrementToken()) {
                String term = createColumnName(field.name(), tokens.getAttribute(TermAttribute.class).term());

                List<Integer> pvec = termPositions.get(term);

                if (pvec == null) {
                    pvec = Lists.newArrayList();
                    termPositions.put(term, pvec);
                }

                position += (tokens.getAttribute(PositionIncrementAttribute.class).getPositionIncrement() - 1);
                pvec.add(++position);

            }
            tokens.close();

        }

        // Untokenized fields go in without a termPosition
        if (field.isIndexed() && !field.isTokenized()) {
            String term = this.createColumnName(field.name(), field.stringValue());
            String key = term;
            termPositions.put(key, EMPTY_TERM_POSITIONS);

        }

        // Stores each field as a column under this doc key
        if (field.isStored()) {

            byte[] value = field.isBinary() ? field.getBinaryValue() : Bytes.toBytes(field.stringValue());

            // first byte flags if binary or not
            final byte[] prefix = Bytes.toBytes((field.isBinary() ? 'B' : 'T'));

            fieldsToStore.put(field.name(), Bytes.add(prefix, value));
        }
    }
    indexStore.indexDocument(docId, new DocumentIndexContext(termPositions, fieldsToStore));
    termPositions.clear();
    fieldsToStore.clear();
}

From source file:org.solbase.lucenehbase.IndexWriter.java

License:Apache License

/**
 * Parses a Lucene {@link Document} into a ParsedDoc: per-term metadata
 * (frequency, positions, offsets, norms, sort values), an HBase Put with
 * the stored fields, and the list of all indexed terms (kept so the
 * document can later be deleted).
 *
 * @param doc            document to parse
 * @param analyzer       analyzer used for fields without a pre-built token stream
 * @param indexName      target index name (NOTE(review): appears unused in this body)
 * @param docNumber      numeric document id
 * @param sortFieldNames field names whose values are embedded as sort keys
 */
@SuppressWarnings("unchecked")
public ParsedDoc parseDoc(Document doc, Analyzer analyzer, String indexName, int docNumber,
        List<String> sortFieldNames) throws CorruptIndexException, IOException {
    // given doc, what are all of terms we indexed
    List<Term> allIndexedTerms = new ArrayList<Term>();
    Map<String, byte[]> fieldCache = new HashMap<String, byte[]>(1024);

    // need to hold onto TermDocMetaData, so it can return this array
    List<TermDocMetadata> metadatas = new ArrayList<TermDocMetadata>();

    // NOTE(review): docId appears unused in the remainder of this method.
    byte[] docId = Bytes.toBytes(docNumber);
    // Running token position, shared across all tokenized fields of this doc.
    int position = 0;

    for (Fieldable field : (List<Fieldable>) doc.getFields()) {
        // Indexed field
        if (field.isIndexed() && field.isTokenized()) {

            TokenStream tokens = field.tokenStreamValue();

            // Fall back to analyzing the raw string value when the field
            // does not carry a ready-made token stream.
            if (tokens == null) {
                tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
            }

            // collect term information per field
            Map<Term, Map<ByteBuffer, List<Number>>> allTermInformation = new ConcurrentSkipListMap<Term, Map<ByteBuffer, List<Number>>>();

            // NOTE(review): lastOffset is never updated below, so stored
            // offsets are always relative to 0 — confirm whether offsets of
            // multi-valued fields should accumulate across field instances.
            int lastOffset = 0;
            // Insert the analyzer's gap between successive tokenized fields
            // so phrase queries do not match across field instances.
            if (position > 0) {
                position += analyzer.getPositionIncrementGap(field.name());
            }

            tokens.reset(); // reset the TokenStream to the first token

            // offsets
            OffsetAttribute offsetAttribute = null;
            if (field.isStoreOffsetWithTermVector())
                offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);

            // positions
            PositionIncrementAttribute posIncrAttribute = null;
            if (field.isStorePositionWithTermVector())
                posIncrAttribute = (PositionIncrementAttribute) tokens
                        .addAttribute(PositionIncrementAttribute.class);

            TermAttribute termAttribute = (TermAttribute) tokens.addAttribute(TermAttribute.class);

            // store normalizations of field per term per document
            // rather
            // than per field.
            // this adds more to write but less to read on other side
            Integer tokensInField = new Integer(0);

            while (tokens.incrementToken()) {
                tokensInField++;
                Term term = new Term(field.name(), termAttribute.term());

                allIndexedTerms.add(term);

                // fetch all collected information for this term
                Map<ByteBuffer, List<Number>> termInfo = allTermInformation.get(term);

                if (termInfo == null) {
                    termInfo = new ConcurrentSkipListMap<ByteBuffer, List<Number>>();
                    allTermInformation.put(term, termInfo);
                }

                // term frequency
                List<Number> termFrequency = termInfo.get(TermDocMetadata.termFrequencyKeyBytes);
                if (termFrequency == null) {
                    termFrequency = new ArrayList<Number>();
                    termFrequency.add(new Integer(0));
                    termInfo.put(TermDocMetadata.termFrequencyKeyBytes, termFrequency);
                }
                // increment
                termFrequency.set(0, termFrequency.get(0).intValue() + 1);

                // position vector
                if (field.isStorePositionWithTermVector()) {
                    // Honor position increments > 1 (e.g. stop-word gaps);
                    // the -1 compensates for the ++position below.
                    position += (posIncrAttribute.getPositionIncrement() - 1);

                    List<Number> positionVector = termInfo.get(TermDocMetadata.positionVectorKeyBytes);

                    if (positionVector == null) {
                        positionVector = new ArrayList<Number>();
                        termInfo.put(TermDocMetadata.positionVectorKeyBytes, positionVector);
                    }

                    positionVector.add(++position);
                }

                // term offsets, stored as (start, end) pairs
                if (field.isStoreOffsetWithTermVector()) {

                    List<Number> offsetVector = termInfo.get(TermDocMetadata.offsetVectorKeyBytes);
                    if (offsetVector == null) {
                        offsetVector = new ArrayList<Number>();
                        termInfo.put(TermDocMetadata.offsetVectorKeyBytes, offsetVector);
                    }

                    offsetVector.add(lastOffset + offsetAttribute.startOffset());
                    offsetVector.add(lastOffset + offsetAttribute.endOffset());

                }

                List<Number> sortValues = new ArrayList<Number>();
                // init sortValues (-1 marks "no value" for each sort slot)
                for (int i = 0; i < Scorer.numSort; i++) {
                    sortValues.add(new Integer(-1));
                }

                int order = 0;

                // extract sort field value and store it in term doc metadata obj
                for (String fieldName : sortFieldNames) {
                    Fieldable fieldable = doc.getFieldable(fieldName);

                    if (fieldable instanceof EmbeddedSortField) {
                        EmbeddedSortField sortField = (EmbeddedSortField) fieldable;

                        int value = -1;
                        if (sortField.stringValue() != null) {
                            value = Integer.parseInt(sortField.stringValue());
                        }
                        int sortSlot = sortField.getSortSlot();

                        // Sort slots are 1-based; the list is 0-based.
                        sortValues.set(sortSlot - 1, new Integer(value));
                    } else {
                        // TODO: this logic is used for real time indexing.
                        // hacky. depending on order of sort field names in array
                        int value = -1;
                        if (fieldable.stringValue() != null) {
                            value = Integer.parseInt(fieldable.stringValue());
                        }
                        sortValues.set(order++, new Integer(value));
                    }
                }
                termInfo.put(TermDocMetadata.sortFieldKeyBytes, sortValues);
            }

            // Field-length norm, written alongside every term of this field.
            List<Number> bnorm = null;
            if (!field.getOmitNorms()) {
                bnorm = new ArrayList<Number>();
                float norm = doc.getBoost();
                norm *= field.getBoost();
                norm *= similarity.lengthNorm(field.name(), tokensInField);
                bnorm.add(Similarity.encodeNorm(norm));
            }

            for (Map.Entry<Term, Map<ByteBuffer, List<Number>>> term : allTermInformation.entrySet()) {
                Term tempTerm = term.getKey();

                byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(tempTerm);

                // Mix in the norm for this field alongside each term
                // more writes but faster on read side.
                if (!field.getOmitNorms()) {
                    term.getValue().put(TermDocMetadata.normsKeyBytes, bnorm);
                }

                TermDocMetadata data = new TermDocMetadata(docNumber, term.getValue(), fieldTermKeyBytes,
                        tempTerm);
                metadatas.add(data);
            }
        }

        // Untokenized fields go in without a termPosition
        if (field.isIndexed() && !field.isTokenized()) {
            Term term = new Term(field.name(), field.stringValue());
            allIndexedTerms.add(term);

            byte[] fieldTermKeyBytes = SolbaseUtil.generateTermKey(term);

            // Empty frequency/position vectors: the term exists but carries
            // no positional information.
            Map<ByteBuffer, List<Number>> termMap = new ConcurrentSkipListMap<ByteBuffer, List<Number>>();
            termMap.put(TermDocMetadata.termFrequencyKeyBytes, Arrays.asList(new Number[] {}));
            termMap.put(TermDocMetadata.positionVectorKeyBytes, Arrays.asList(new Number[] {}));

            TermDocMetadata data = new TermDocMetadata(docNumber, termMap, fieldTermKeyBytes, term);
            metadatas.add(data);
        }

        // Stores each field as a column under this doc key
        if (field.isStored()) {

            byte[] _value = field.isBinary() ? field.getBinaryValue() : Bytes.toBytes(field.stringValue());

            // Append a one-byte binary/text marker to the value.
            // NOTE(review): an earlier comment said "first byte", but the
            // flag is actually written to the LAST byte of the array.
            byte[] value = new byte[_value.length + 1];
            System.arraycopy(_value, 0, value, 0, _value.length);

            value[value.length - 1] = (byte) (field.isBinary() ? Byte.MAX_VALUE : Byte.MIN_VALUE);

            // logic to handle multiple fields w/ same name:
            // append the new value after a delimiter, dropping the previous
            // value's trailing flag byte (hence the various -1 offsets).
            byte[] currentValue = fieldCache.get(field.name());
            if (currentValue == null) {
                fieldCache.put(field.name(), value);
            } else {

                // append new data
                byte[] newValue = new byte[currentValue.length + SolbaseUtil.delimiter.length + value.length
                        - 1];
                System.arraycopy(currentValue, 0, newValue, 0, currentValue.length - 1);
                System.arraycopy(SolbaseUtil.delimiter, 0, newValue, currentValue.length - 1,
                        SolbaseUtil.delimiter.length);
                System.arraycopy(value, 0, newValue, currentValue.length + SolbaseUtil.delimiter.length - 1,
                        value.length);

                fieldCache.put(field.name(), newValue);
            }
        }
    }

    // Row key for the document's stored fields.
    // NOTE(review): "randomize" semantics taken on trust from SolbaseUtil —
    // confirm it is a deterministic key transform, not a random value.
    Put documentPut = new Put(SolbaseUtil.randomize(docNumber));

    // Store each field as a column under this docId
    for (Map.Entry<String, byte[]> field : fieldCache.entrySet()) {
        documentPut.add(Bytes.toBytes("field"), Bytes.toBytes(field.getKey()), field.getValue());
    }

    // in case of real time update, we need to add back docId field
    if (!documentPut.has(Bytes.toBytes("field"), Bytes.toBytes("docId"))) {

        byte[] docIdStr = Bytes.toBytes(new Integer(docNumber).toString());
        // Append the one-byte text marker (see stored-field handling above).
        byte[] value = new byte[docIdStr.length + 1];
        System.arraycopy(docIdStr, 0, value, 0, docIdStr.length);

        value[value.length - 1] = (byte) (Byte.MIN_VALUE);
        documentPut.add(Bytes.toBytes("field"), Bytes.toBytes("docId"), value);
    }

    // Finally, Store meta-data so we can delete this document
    documentPut.add(Bytes.toBytes("allTerms"), Bytes.toBytes("allTerms"),
            SolbaseUtil.toBytes(allIndexedTerms).array());

    ParsedDoc parsedDoc = new ParsedDoc(metadatas, doc, documentPut, fieldCache.entrySet(), allIndexedTerms);
    return parsedDoc;

}