Usage examples for the org.apache.lucene.index.Fields#terms(String) method
public abstract Terms terms(String field) throws IOException;
From source file:VocabDumper.java
/**
 * Prints every indexed term of the configured English/Hindi fields to stdout,
 * skipping terms that contain '#'.
 *
 * @throws IOException if the index directory cannot be opened or read
 */
void printWords() throws IOException {
    // try-with-resources: the original leaked the reader (never closed).
    try (IndexReader indexReader = DirectoryReader.open(FSDirectory.open(indexDir))) {
        Fields fields = MultiFields.getFields(indexReader);
        String[] fieldNames = { MSIRDoc.FIELD_TITLE_EN, MSIRDoc.FIELD_EN,
                MSIRDoc.FIELD_TITLE_HN, MSIRDoc.FIELD_HN };
        for (String fieldName : fieldNames) {
            Terms terms = fields.terms(fieldName);
            if (terms == null) {
                continue; // field absent from this index — original would NPE here
            }
            TermsEnum iterator = terms.iterator(null);
            BytesRef byteRef;
            while ((byteRef = iterator.next()) != null) {
                // utf8ToString() decodes the term bytes as UTF-8 explicitly;
                // new String(bytes, off, len) used the platform default charset.
                String term = byteRef.utf8ToString();
                if (term.indexOf('#') == -1) {
                    System.out.println(term);
                }
            }
        }
    }
}
From source file:BlockBuilding.AbstractBlockBuilding.java
License:Apache License
protected Map<String, int[]> parseD1Index(IndexReader d1Index, IndexReader d2Index) { try {// ww w. ja va 2s.c o m int[] documentIds = getDocumentIds(d1Index); final Map<String, int[]> hashedBlocks = new HashMap<>(); Fields fields = MultiFields.getFields(d1Index); for (String field : fields) { Terms terms = fields.terms(field); TermsEnum termsEnum = terms.iterator(); BytesRef text; while ((text = termsEnum.next()) != null) { // check whether it is a common term int d2DocFrequency = d2Index.docFreq(new Term(field, text)); if (d2DocFrequency == 0) { continue; } final List<Integer> entityIds = new ArrayList<>(); PostingsEnum pe = MultiFields.getTermDocsEnum(d1Index, field, text); int doc; while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { entityIds.add(documentIds[doc]); } int[] idsArray = Converter.convertCollectionToArray(entityIds); hashedBlocks.put(text.utf8ToString(), idsArray); } } return hashedBlocks; } catch (IOException ex) { LOGGER.log(Level.SEVERE, null, ex); return null; } }
From source file:BlockBuilding.AbstractBlockBuilding.java
License:Apache License
protected void parseD2Index(IndexReader d2Index, Map<String, int[]> hashedBlocks) { try {//from ww w . ja va2 s .c o m int[] documentIds = getDocumentIds(d2Index); Fields fields = MultiFields.getFields(d2Index); for (String field : fields) { Terms terms = fields.terms(field); TermsEnum termsEnum = terms.iterator(); BytesRef text; while ((text = termsEnum.next()) != null) { if (!hashedBlocks.containsKey(text.utf8ToString())) { continue; } final List<Integer> entityIds = new ArrayList<>(); PostingsEnum pe = MultiFields.getTermDocsEnum(d2Index, field, text); int doc; while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { entityIds.add(documentIds[doc]); } int[] idsArray = Converter.convertCollectionToArray(entityIds); int[] d1Entities = hashedBlocks.get(text.utf8ToString()); blocks.add(new BilateralBlock(d1Entities, idsArray)); } } } catch (IOException ex) { LOGGER.log(Level.SEVERE, null, ex); } }
From source file:BlockBuilding.AbstractBlockBuilding.java
License:Apache License
protected void parseIndex(IndexReader d1Index) { try {//from ww w . j av a 2 s. c o m int[] documentIds = getDocumentIds(d1Index); Fields fields = MultiFields.getFields(d1Index); for (String field : fields) { Terms terms = fields.terms(field); TermsEnum termsEnum = terms.iterator(); BytesRef text; while ((text = termsEnum.next()) != null) { if (termsEnum.docFreq() < 2) { continue; } final List<Integer> entityIds = new ArrayList<>(); PostingsEnum pe = MultiFields.getTermDocsEnum(d1Index, field, text); int doc; while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { entityIds.add(documentIds[doc]); } int[] idsArray = Converter.convertCollectionToArray(entityIds); UnilateralBlock block = new UnilateralBlock(idsArray); blocks.add(block); } } } catch (IOException ex) { LOGGER.log(Level.SEVERE, null, ex); } }
From source file:BlockBuilding.AbstractSortedNeighborhoodBlocking.java
License:Open Source License
protected Set<String> getTerms(IndexReader iReader) { Set<String> sortedTerms = new HashSet<>(); try {// w ww . j a va2s. c om Fields fields = MultiFields.getFields(iReader); for (String field : fields) { Terms terms = fields.terms(field); TermsEnum termsEnum = terms.iterator(null); BytesRef text; while ((text = termsEnum.next()) != null) { sortedTerms.add(text.utf8ToString()); } } } catch (IOException ex) { ex.printStackTrace(); } return sortedTerms; }
From source file:BlockBuilding.SortedNeighborhoodBlocking.java
License:Apache License
protected Set<String> getTerms(IndexReader iReader) { Set<String> sortedTerms = new HashSet<>(); try {//from w ww.ja va 2 s . co m Fields fields = MultiFields.getFields(iReader); for (String field : fields) { Terms terms = fields.terms(field); TermsEnum termsEnum = terms.iterator(); BytesRef text; while ((text = termsEnum.next()) != null) { sortedTerms.add(text.utf8ToString()); } } } catch (IOException ex) { LOGGER.log(Level.SEVERE, null, ex); } return sortedTerms; }
From source file:com.core.nlp.index.IndexReader.java
License:Apache License
/**
 * Retrieve term vector for this document and field, or null if term vectors
 * were not indexed. The returned Fields instance acts like a single-document
 * inverted index (the docID will be 0).
 */
public final Terms getTermVector(int docID, String field) throws IOException {
    Fields vectors = getTermVectors(docID);
    return (vectors == null) ? null : vectors.terms(field);
}
From source file:com.core.nlp.query.MoreLikeThis.java
License:Apache License
/** * Find words for a more-like-this query former. * * @param docNum the id of the lucene document from which to find terms *//*from w w w.j a v a2 s.c o m*/ private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException { Map<String, Int> termFreqMap = new HashMap<>(); for (String fieldName : fieldNames) { final Fields vectors = ir.getTermVectors(docNum); final Terms vector; if (vectors != null) { vector = vectors.terms(fieldName); } else { vector = null; } // field does not store term vector info if (vector == null) { Document d = ir.document(docNum); IndexableField[] fields = d.getFields(fieldName); for (IndexableField field : fields) { final String stringValue = field.stringValue(); if (stringValue != null) { addTermFrequencies(new StringReader(stringValue), termFreqMap, fieldName); } } } else { addTermFrequencies(termFreqMap, vector); } } return createQueue(termFreqMap); }
From source file:com.factweavers.elasticsearch.payloadscorefunction.PayloadScoringFunction.java
License:Apache License
@Override public double score(int docId, float subQueryScore) { indexLookup.setNextDocId(docId);//from ww w . ja v a 2s. c o m float score = 0; int obtainedTerms = 0; try { Fields termVectors = indexLookup.termVectors(); Boolean isPayloadOrIndex = false; TermsEnum iterator = null; if (termVectors != null && termVectors.terms(field) != null && termVectors.terms(field).hasPayloads()) { isPayloadOrIndex = true; Terms fields = termVectors.terms(field); iterator = fields.iterator(null); } if (isPayloadOrIndex) { BytesRef firstElement = iterator.next(); while (firstElement != null && (obtainedTerms < values.size())) { String currentValue = firstElement.utf8ToString(); if (!values.contains(currentValue)) { //logger.info("Payload Skipping " + currentValue); firstElement = iterator.next(); continue; } else { obtainedTerms++; } //logger.info("Payload processing value is " + currentValue); DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null); docsAndPositions.nextDoc(); docsAndPositions.nextPosition(); BytesRef payload = docsAndPositions.getPayload(); if (payload != null) { score += PayloadHelper.decodeFloat(payload.bytes, payload.offset); //logger.info("Score " + score); } else { score += defaultValue; } firstElement = iterator.next(); } } else { IndexField fieldObject = indexLookup.get(field); for (String value : values) { IndexFieldTerm tokens = fieldObject.get(value, IndexLookup.FLAG_CACHE | IndexLookup.FLAG_PAYLOADS); if (fieldObject != null && tokens != null) { //logger.info("Processing docID=" + docId + " " + field // + " for " + value + " , " + tokens); if (tokens.iterator().hasNext()) { score += tokens.iterator().next().payloadAsFloat(defaultValue); } } } } } catch (IOException e) { //logger.info("Exception in Term Vectors"); e.printStackTrace(); } return new Double(score); }
From source file:com.github.flaxsearch.resources.TermsResource.java
License:Apache License
/**
 * Returns up to {@code count} terms of {@code field}, optionally starting at
 * {@code startTerm} (seekCeil) and filtered by {@code filter}. Terms are
 * encoded with the requested encoding.
 *
 * @throws WebApplicationException 404 if the field does not exist, 400 if a
 *         term cannot be decoded with the given encoding
 * @throws IOException on index read failure
 */
@GET
public TermsData getTerms(@QueryParam("segment") Integer segment, @PathParam("field") String field,
        @QueryParam("from") String startTerm, @QueryParam("filter") String filter,
        @QueryParam("encoding") @DefaultValue("utf8") String encoding,
        @QueryParam("count") @DefaultValue("50") int count) throws IOException {
    try {
        Fields fields = readerManager.getFields(segment);
        Terms terms = fields.terms(field);
        if (terms == null) {
            throw new WebApplicationException("No such field " + field, Response.Status.NOT_FOUND);
        }
        TermsEnum te = getTermsEnum(terms, filter);
        List<String> collected = new ArrayList<>();
        // Position the enum: at the first term >= startTerm, or at the first term.
        if (startTerm != null) {
            BytesRef start = BytesRefUtils.decode(startTerm, encoding);
            if (te.seekCeil(start) == TermsEnum.SeekStatus.END) {
                return new TermsData(terms, Collections.emptyList(), encoding);
            }
        } else if (te.next() == null) {
            return new TermsData(terms, Collections.emptyList(), encoding);
        }
        // Guard fixes an off-by-one: the original do-while always emitted one
        // term even when the client asked for count <= 0.
        if (count > 0) {
            do {
                collected.add(BytesRefUtils.encode(te.term(), encoding));
            } while (te.next() != null && --count > 0);
        }
        return new TermsData(terms, collected, encoding);
    } catch (NumberFormatException e) {
        throw new WebApplicationException("Field " + field + " cannot be decoded as " + encoding,
                Response.Status.BAD_REQUEST);
    }
}