Example usage for org.apache.lucene.index Fields terms

List of usage examples for org.apache.lucene.index Fields terms

Introduction

On this page you can find usage examples for org.apache.lucene.index Fields.terms.

Prototype

public abstract Terms terms(String field) throws IOException;

Document

Get the Terms for this field. This may return null if the field does not exist.
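
Before the project examples, here is a minimal, self-contained sketch of the call. It assumes a Lucene 5.x/6.x index; the path "/path/to/index" and the field name "contents" are placeholders, not part of any example below. Because terms(field) may return null for a missing field, the result is checked before iterating:

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class TermsLister {
    public static void main(String[] args) throws Exception {
        try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            // MultiFields merges the per-segment postings into one logical view
            Fields fields = MultiFields.getFields(reader);
            Terms terms = fields == null ? null : fields.terms("contents");
            if (terms == null) {
                return; // the index or the field has no postings
            }
            TermsEnum te = terms.iterator();
            BytesRef term;
            while ((term = te.next()) != null) {
                System.out.println(term.utf8ToString());
            }
        }
    }
}

Note that the examples below target different Lucene versions: the older 4.x API obtains the enumeration with terms.iterator(null), the newer one with terms.iterator().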

Usage

From source file:VocabDumper.java

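/** Prints every term of the configured title and body fields, skipping terms that contain '#'. */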
void printWords() throws IOException {
    IndexReader indexReader = DirectoryReader.open(FSDirectory.open(indexDir));
    Fields fields = MultiFields.getFields(indexReader);

    String[] fieldNames = { MSIRDoc.FIELD_TITLE_EN, MSIRDoc.FIELD_EN, MSIRDoc.FIELD_TITLE_HN,
            MSIRDoc.FIELD_HN };

    for (String fieldName : fieldNames) {
        Terms terms = fields.terms(fieldName);
        if (terms == null) { // terms() returns null when the field is absent from the index
            continue;
        }
        TermsEnum iterator = terms.iterator(null);
        BytesRef byteRef = null;
        while ((byteRef = iterator.next()) != null) {
            String term = byteRef.utf8ToString(); // decode as UTF-8, not the platform default charset
            if (term.indexOf('#') == -1)
                System.out.println(term);
        }
    }
    indexReader.close();
}

From source file:BlockBuilding.AbstractBlockBuilding.java

License:Apache License

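/** Maps each term of d1Index that also occurs in d2Index to the ids of the d1 documents containing it. */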
protected Map<String, int[]> parseD1Index(IndexReader d1Index, IndexReader d2Index) {
    try {
        int[] documentIds = getDocumentIds(d1Index);
        final Map<String, int[]> hashedBlocks = new HashMap<>();
        Fields fields = MultiFields.getFields(d1Index);
        for (String field : fields) {
            Terms terms = fields.terms(field);
            TermsEnum termsEnum = terms.iterator();
            BytesRef text;
            while ((text = termsEnum.next()) != null) {
                // check whether it is a common term
                int d2DocFrequency = d2Index.docFreq(new Term(field, text));
                if (d2DocFrequency == 0) {
                    continue;
                }

                final List<Integer> entityIds = new ArrayList<>();
                PostingsEnum pe = MultiFields.getTermDocsEnum(d1Index, field, text);
                int doc;
                while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    entityIds.add(documentIds[doc]);
                }

                int[] idsArray = Converter.convertCollectionToArray(entityIds);
                hashedBlocks.put(text.utf8ToString(), idsArray);
            }
        }
        return hashedBlocks;
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
        return null;
    }
}

From source file:BlockBuilding.AbstractBlockBuilding.java

License:Apache License

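/** For every d2Index term that was hashed from d1Index, pairs the two entity id lists into a BilateralBlock. */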
protected void parseD2Index(IndexReader d2Index, Map<String, int[]> hashedBlocks) {
    try {
        int[] documentIds = getDocumentIds(d2Index);
        Fields fields = MultiFields.getFields(d2Index);
        for (String field : fields) {
            Terms terms = fields.terms(field);
            TermsEnum termsEnum = terms.iterator();
            BytesRef text;
            while ((text = termsEnum.next()) != null) {
                if (!hashedBlocks.containsKey(text.utf8ToString())) {
                    continue;
                }

                final List<Integer> entityIds = new ArrayList<>();
                PostingsEnum pe = MultiFields.getTermDocsEnum(d2Index, field, text);
                int doc;
                while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    entityIds.add(documentIds[doc]);
                }

                int[] idsArray = Converter.convertCollectionToArray(entityIds);
                int[] d1Entities = hashedBlocks.get(text.utf8ToString());
                blocks.add(new BilateralBlock(d1Entities, idsArray));
            }
        }

    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
    }
}

From source file:BlockBuilding.AbstractBlockBuilding.java

License:Apache License

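/** Creates a UnilateralBlock from the documents of every term whose document frequency is at least 2. */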
protected void parseIndex(IndexReader d1Index) {
    try {
        int[] documentIds = getDocumentIds(d1Index);
        Fields fields = MultiFields.getFields(d1Index);
        for (String field : fields) {
            Terms terms = fields.terms(field);
            TermsEnum termsEnum = terms.iterator();
            BytesRef text;
            while ((text = termsEnum.next()) != null) {
                if (termsEnum.docFreq() < 2) {
                    continue;
                }

                final List<Integer> entityIds = new ArrayList<>();
                PostingsEnum pe = MultiFields.getTermDocsEnum(d1Index, field, text);
                int doc;
                while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    entityIds.add(documentIds[doc]);
                }

                int[] idsArray = Converter.convertCollectionToArray(entityIds);
                UnilateralBlock block = new UnilateralBlock(idsArray);
                blocks.add(block);
            }
        }
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
    }
}

From source file:BlockBuilding.AbstractSortedNeighborhoodBlocking.java

License:Open Source License

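/** Collects the distinct terms of all fields; despite the variable name, the HashSet keeps them unsorted. */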
protected Set<String> getTerms(IndexReader iReader) {
    Set<String> sortedTerms = new HashSet<>();
    try {
        Fields fields = MultiFields.getFields(iReader);
        for (String field : fields) {
            Terms terms = fields.terms(field);
            TermsEnum termsEnum = terms.iterator(null);
            BytesRef text;
            while ((text = termsEnum.next()) != null) {
                sortedTerms.add(text.utf8ToString());
            }
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
    return sortedTerms;
}

From source file:BlockBuilding.SortedNeighborhoodBlocking.java

License:Apache License

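/** Same logic as the variant above, but written against the newer no-argument TermsEnum iterator() API. */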
protected Set<String> getTerms(IndexReader iReader) {
    Set<String> sortedTerms = new HashSet<>();
    try {
        Fields fields = MultiFields.getFields(iReader);
        for (String field : fields) {
            Terms terms = fields.terms(field);
            TermsEnum termsEnum = terms.iterator();
            BytesRef text;
            while ((text = termsEnum.next()) != null) {
                sortedTerms.add(text.utf8ToString());
            }
        }
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
    }
    return sortedTerms;
}

From source file:com.core.nlp.index.IndexReader.java

License:Apache License

/** Retrieve term vector for this document and field, or
 *  null if term vectors were not indexed.  The returned
 *  Fields instance acts like a single-document inverted
 *  index (the docID will be 0). */
public final Terms getTermVector(int docID, String field) throws IOException {
    Fields vectors = getTermVectors(docID);
    if (vectors == null) {
        return null;
    }
    return vectors.terms(field);
}

From source file:com.core.nlp.query.MoreLikeThis.java

License:Apache License

/**
 * Find words for a more-like-this query former.
 *
 * @param docNum the id of the lucene document from which to find terms
 */
private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
    Map<String, Int> termFreqMap = new HashMap<>();
    for (String fieldName : fieldNames) {
        final Fields vectors = ir.getTermVectors(docNum);
        final Terms vector;
        if (vectors != null) {
            vector = vectors.terms(fieldName);
        } else {
            vector = null;
        }

        // field does not store term vector info
        if (vector == null) {
            Document d = ir.document(docNum);
            IndexableField[] fields = d.getFields(fieldName);
            for (IndexableField field : fields) {
                final String stringValue = field.stringValue();
                if (stringValue != null) {
                    addTermFrequencies(new StringReader(stringValue), termFreqMap, fieldName);
                }
            }
        } else {
            addTermFrequencies(termFreqMap, vector);
        }
    }

    return createQueue(termFreqMap);
}

From source file:com.factweavers.elasticsearch.payloadscorefunction.PayloadScoringFunction.java

License:Apache License

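/** Sums payload-based scores for the configured values, reading from the document's term vectors when they store payloads and falling back to the script index lookup otherwise. */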
@Override
public double score(int docId, float subQueryScore) {
    indexLookup.setNextDocId(docId);
    float score = 0;
    int obtainedTerms = 0;
    try {
        Fields termVectors = indexLookup.termVectors();
        boolean isPayloadOrIndex = false;
        TermsEnum iterator = null;
        Terms fieldTerms = termVectors == null ? null : termVectors.terms(field); // look the field up once
        if (fieldTerms != null && fieldTerms.hasPayloads()) {
            isPayloadOrIndex = true;
            iterator = fieldTerms.iterator(null);
        }

        if (isPayloadOrIndex) {
            BytesRef firstElement = iterator.next();
            while (firstElement != null && (obtainedTerms < values.size())) {
                String currentValue = firstElement.utf8ToString();
                if (!values.contains(currentValue)) {
                    //logger.info("Payload Skipping " + currentValue);
                    firstElement = iterator.next();
                    continue;
                } else {
                    obtainedTerms++;
                }
                //logger.info("Payload processing value is " + currentValue);
                DocsAndPositionsEnum docsAndPositions = iterator.docsAndPositions(null, null);
                docsAndPositions.nextDoc();
                docsAndPositions.nextPosition();
                BytesRef payload = docsAndPositions.getPayload();
                if (payload != null) {
                    score += PayloadHelper.decodeFloat(payload.bytes, payload.offset);
                    //logger.info("Score " + score);
                } else {
                    score += defaultValue;
                }
                firstElement = iterator.next();
            }
        } else {
            IndexField fieldObject = indexLookup.get(field);
            if (fieldObject != null) { // check before calling get(), or the call below could throw an NPE
                for (String value : values) {
                    IndexFieldTerm tokens = fieldObject.get(value,
                            IndexLookup.FLAG_CACHE | IndexLookup.FLAG_PAYLOADS);
                    if (tokens != null) {
                        //logger.info("Processing docID=" + docId + " " + field
                        //      + " for " + value + " , " + tokens);
                        if (tokens.iterator().hasNext()) {
                            score += tokens.iterator().next().payloadAsFloat(defaultValue);
                        }
                    }
                }
            }
        }
    } catch (IOException e) {
        //logger.info("Exception in Term Vectors");
        e.printStackTrace();
    }
    return score; // the method returns double, so no boxing is needed
}

From source file:com.github.flaxsearch.resources.TermsResource.java

License:Apache License

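/** Lists up to "count" terms of a field, optionally filtered and starting from the "from" term, in the requested encoding. */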
@GET
public TermsData getTerms(@QueryParam("segment") Integer segment, @PathParam("field") String field,
        @QueryParam("from") String startTerm, @QueryParam("filter") String filter,
        @QueryParam("encoding") @DefaultValue("utf8") String encoding,
        @QueryParam("count") @DefaultValue("50") int count) throws IOException {

    try {
        Fields fields = readerManager.getFields(segment);
        Terms terms = fields.terms(field);

        if (terms == null)
            throw new WebApplicationException("No such field " + field, Response.Status.NOT_FOUND);

        TermsEnum te = getTermsEnum(terms, filter);
        List<String> collected = new ArrayList<>();

        if (startTerm != null) {
            BytesRef start = BytesRefUtils.decode(startTerm, encoding);
            if (te.seekCeil(start) == TermsEnum.SeekStatus.END)
                return new TermsData(terms, Collections.emptyList(), encoding);
        } else {
            if (te.next() == null) {
                return new TermsData(terms, Collections.emptyList(), encoding);
            }
        }

        do {
            collected.add(BytesRefUtils.encode(te.term(), encoding));
        } while (te.next() != null && --count > 0);

        return new TermsData(terms, collected, encoding);
    } catch (NumberFormatException e) {
        throw new WebApplicationException("Field " + field + " cannot be decoded as " + encoding,
                Response.Status.BAD_REQUEST);
    }
}