Example usage for org.apache.lucene.util BytesRef utf8ToString

List of usage examples for org.apache.lucene.util BytesRef utf8ToString

Introduction

In this page you can find the example usage for org.apache.lucene.util BytesRef utf8ToString.

Prototype

public String utf8ToString() 

Source Link

Document

Interprets stored bytes as UTF8 bytes, returning the resulting string

Usage

From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.newsFetch.storm.bolts.NewsItemToTermsBolt.java

License: Apache License

/**
 * Looks up the document whose {@code id} field matches {@code id} and folds the
 * TF-IDF scores of its term vector for {@code field} into {@code termMap},
 * scaled by {@code weight}. Repeated terms accumulate their contributions.
 *
 * @param reader  reader over the index holding the term vectors
 * @param searcher searcher used to resolve the document id
 * @param termMap accumulator from term text to weighted TF-IDF score (mutated)
 * @param id      value of the stored "id" field identifying the document
 * @param field   field whose term vector is scored
 * @param weight  multiplier applied to each term's TF-IDF contribution
 * @throws IOException on index access failure
 */
private void updateTermMap(DirectoryReader reader, IndexSearcher searcher, Map<String, Double> termMap,
        String id, String field, double weight) throws IOException {
    Query query = new TermQuery(new Term("id", id));
    TopDocs topdocs = searcher.search(query, 1);

    if (topdocs.totalHits > 0) {
        int docNr = topdocs.scoreDocs[0].doc;
        Terms vector = reader.getTermVector(docNr, field);
        if (vector != null) {
            TermsEnum termsEnum = vector.iterator(TermsEnum.EMPTY);
            // Frequency window, hoisted out of the loop (invariant per call):
            // ignore really rare and really common terms.
            double minFreq = reader.numDocs() * 0.0001;
            // Was numDocs() / 3 — integer division truncated the upper bound.
            double maxFreq = reader.numDocs() / 3.0;
            BytesRef text;
            while ((text = termsEnum.next()) != null) {
                String term = text.utf8ToString();
                int docFreq = reader.docFreq(new Term(field, text));
                if (docFreq > minFreq && docFreq < maxFreq) {
                    double tf = 1 + ((double) termsEnum.totalTermFreq()) / reader.getSumTotalTermFreq(field);
                    double idf = Math.log((double) reader.numDocs() / docFreq);
                    // idf is +Infinity when docFreq is 0; skip those terms.
                    if (!Double.isInfinite(idf)) {
                        // Accumulate: equivalent to the containsKey/get/put dance.
                        termMap.merge(term, tf * idf * weight, Double::sum);
                    }
                }
            }
        } else {
            logger.debug("no term available for doc=" + docNr + " and field=" + field);
        }
    } else {
        logger.warn("No documents found with id=" + id);
    }
}

From source file:be.ugent.tiwi.sleroux.newsrec.stormNewsFetch.storm.bolts.NewsItemToTermsBolt.java

License: Apache License

/**
 * Looks up the document whose numeric {@code id} field equals {@code id} and
 * folds the TF-IDF scores of its term vector for {@code field} into
 * {@code termMap}, scaled by {@code weight}. Repeated terms accumulate.
 *
 * @param reader  reader over the index holding the term vectors
 * @param searcher searcher used to resolve the document id
 * @param termMap accumulator from term text to weighted TF-IDF score (mutated)
 * @param id      numeric id of the document
 * @param field   field whose term vector is scored
 * @param weight  multiplier applied to each term's TF-IDF contribution
 * @throws IOException on index access failure
 */
private void updateTermMap(DirectoryReader reader, IndexSearcher searcher, Map<String, Double> termMap, long id,
        String field, double weight) throws IOException {
    Query query = NumericRangeQuery.newLongRange("id", id, id, true, true);
    TopDocs topdocs = searcher.search(query, 1);

    if (topdocs.totalHits > 0) {
        int docNr = topdocs.scoreDocs[0].doc;
        Terms vector = reader.getTermVector(docNr, field);
        if (vector != null) {
            TermsEnum termsEnum = vector.iterator(TermsEnum.EMPTY);
            // Frequency filter is effectively disabled here: every term with
            // docFreq > 0 passes. (A variant of this method elsewhere applies
            // real rare/common thresholds.)
            double minFreq = 0;
            double maxFreq = Double.MAX_VALUE;
            BytesRef text;
            while ((text = termsEnum.next()) != null) {
                String term = text.utf8ToString();
                int docFreq = reader.docFreq(new Term(field, text));
                if (docFreq > minFreq && docFreq < maxFreq) {
                    double tf = 1 + ((double) termsEnum.totalTermFreq()) / reader.getSumTotalTermFreq(field);
                    double idf = Math.log((double) reader.numDocs() / docFreq);
                    // idf is +Infinity when docFreq is 0; skip those terms.
                    if (!Double.isInfinite(idf)) {
                        // Accumulate: equivalent to the containsKey/get/put dance.
                        termMap.merge(term, tf * idf * weight, Double::sum);
                    }
                }
            }
        } else {
            logger.debug("no term available for doc=" + docNr + " and field=" + field);
        }
    } else {
        logger.warn("No documents found with id=" + id);
    }
}

From source file:BlockBuilding.AbstractBlockBuilding.java

License: Apache License

/**
 * Scans every field of the first index and maps each term that also occurs in
 * the second index to the array of D1 entity ids containing it. Terms absent
 * from D2 are skipped, since they can never form a bilateral block.
 *
 * @param d1Index reader over the first collection's index
 * @param d2Index reader over the second collection's index (used only for docFreq)
 * @return term-to-entity-ids map, or {@code null} if an I/O error occurred
 */
protected Map<String, int[]> parseD1Index(IndexReader d1Index, IndexReader d2Index) {
    final Map<String, int[]> hashedBlocks = new HashMap<>();
    try {
        final int[] documentIds = getDocumentIds(d1Index);
        final Fields allFields = MultiFields.getFields(d1Index);
        for (String fieldName : allFields) {
            final Terms fieldTerms = allFields.terms(fieldName);
            final TermsEnum termIterator = fieldTerms.iterator();
            for (BytesRef term = termIterator.next(); term != null; term = termIterator.next()) {
                // Common-term check: ignore terms the second index never saw.
                if (d2Index.docFreq(new Term(fieldName, term)) == 0) {
                    continue;
                }

                final List<Integer> entityIds = new ArrayList<>();
                final PostingsEnum postings = MultiFields.getTermDocsEnum(d1Index, fieldName, term);
                for (int doc = postings.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postings.nextDoc()) {
                    entityIds.add(documentIds[doc]);
                }

                hashedBlocks.put(term.utf8ToString(), Converter.convertCollectionToArray(entityIds));
            }
        }
        return hashedBlocks;
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
        return null; // preserved contract: callers receive null on I/O failure
    }
}

From source file:BlockBuilding.AbstractBlockBuilding.java

License: Apache License

/**
 * Scans every field of the second index and, for each term already present in
 * {@code hashedBlocks} (built from the first index), appends a
 * {@code BilateralBlock} pairing the D1 entity ids with the D2 entity ids.
 *
 * @param d2Index      reader over the second collection's index
 * @param hashedBlocks term-to-D1-entity-ids map produced by parseD1Index
 */
protected void parseD2Index(IndexReader d2Index, Map<String, int[]> hashedBlocks) {
    try {
        int[] documentIds = getDocumentIds(d2Index);
        Fields fields = MultiFields.getFields(d2Index);
        for (String field : fields) {
            Terms terms = fields.terms(field);
            TermsEnum termsEnum = terms.iterator();
            BytesRef text;
            while ((text = termsEnum.next()) != null) {
                // Decode the term once; the original decoded the same bytes up
                // to three times per term (containsKey, get, and the skip path).
                String key = text.utf8ToString();
                int[] d1Entities = hashedBlocks.get(key);
                if (d1Entities == null) {
                    continue; // term never survived the D1 pass
                }

                final List<Integer> entityIds = new ArrayList<>();
                PostingsEnum pe = MultiFields.getTermDocsEnum(d2Index, field, text);
                int doc;
                while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                    entityIds.add(documentIds[doc]);
                }

                int[] idsArray = Converter.convertCollectionToArray(entityIds);
                blocks.add(new BilateralBlock(d1Entities, idsArray));
            }
        }

    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
    }
}

From source file:BlockBuilding.AbstractSortedNeighborhoodBlocking.java

License: Open Source License

/**
 * Collects the distinct term texts across all fields of the given reader.
 * Despite the original variable name, the returned set is an unsorted HashSet.
 *
 * @param iReader reader to enumerate terms from
 * @return set of all term texts (possibly partial if an I/O error occurred)
 */
protected Set<String> getTerms(IndexReader iReader) {
    final Set<String> collected = new HashSet<>();
    try {
        final Fields allFields = MultiFields.getFields(iReader);
        for (String fieldName : allFields) {
            // Legacy TermsEnum reuse API: null means "no enum to reuse".
            final TermsEnum termIterator = allFields.terms(fieldName).iterator(null);
            for (BytesRef term = termIterator.next(); term != null; term = termIterator.next()) {
                collected.add(term.utf8ToString());
            }
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
    return collected;
}

From source file:BlockBuilding.SortedNeighborhoodBlocking.java

License: Apache License

/**
 * Collects the distinct term texts across all fields of the given reader.
 * Despite the original variable name, the returned set is an unsorted HashSet.
 *
 * @param iReader reader to enumerate terms from
 * @return set of all term texts (possibly partial if an I/O error occurred)
 */
protected Set<String> getTerms(IndexReader iReader) {
    final Set<String> collected = new HashSet<>();
    try {
        final Fields allFields = MultiFields.getFields(iReader);
        for (String fieldName : allFields) {
            final TermsEnum termIterator = allFields.terms(fieldName).iterator();
            for (BytesRef term = termIterator.next(); term != null; term = termIterator.next()) {
                collected.add(term.utf8ToString());
            }
        }
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
    }
    return collected;
}

From source file:br.bireme.ngrams.Tools.java

/**
 * Prints, one per line, every term of {@code fieldName} found in the first
 * leaf reader of the index at {@code indexName}.
 *
 * @param indexName filesystem path of the Lucene index directory
 * @param fieldName field whose terms are listed
 * @throws NullPointerException if either argument is null
 * @throws IOException on index access failure or if the index has no leaves
 */
public static void showTerms(final String indexName, final String fieldName) throws IOException {
    if (indexName == null) {
        throw new NullPointerException("indexName");
    }
    if (fieldName == null) {
        throw new NullPointerException("fieldName");
    }
    // The reader is now closed too — the original opened it and leaked it.
    try (Directory directory = FSDirectory.open(new File(indexName).toPath());
            DirectoryReader ireader = DirectoryReader.open(directory)) {
        final List<LeafReaderContext> leaves = ireader.leaves();
        if (leaves.isEmpty()) {
            throw new IOException("empty leaf readers list");
        }
        // NOTE(review): only the FIRST leaf is inspected, so a multi-segment
        // index shows a subset of its terms — confirm that is intended.
        final Terms terms = leaves.get(0).reader().terms(fieldName);
        if (terms != null) {
            final TermsEnum tenum = terms.iterator();
            int pos = 0;
            while (true) {
                final BytesRef br = tenum.next();
                if (br == null) {
                    break;
                }
                System.out.println((++pos) + ") term=[" + br.utf8ToString() + "] ");
            }
        }
    }
}

From source file:br.pucminas.ri.jsearch.queryexpansion.RocchioQueryExpansion.java

License: Open Source License

/**
 * Computes a BETA-scaled TF-IDF score for every term of the DOC_CONTENT field
 * across all leaves of the index in {@code directory}. For a term occurring in
 * several documents, the score of the last posting visited wins (map put, not
 * merge — preserved from the original).
 *
 * @param directory directory holding the index to score
 * @return list of (term, score) entries, in map iteration order
 * @throws CorruptIndexException if the index is corrupt
 * @throws IOException on failure opening the reader
 */
private List<Entry<String, Float>> getTermScoreList(Directory directory)
        throws CorruptIndexException, IOException {

    Map<String, Float> termScoreMap = new HashMap<>();

    ConcreteTFIDFSimilarity sim = new ConcreteTFIDFSimilarity();

    // try-with-resources owns the reader. The original ALSO closed idxReader in
    // a finally block inside the per-leaf lambda, shutting the reader down after
    // the first leaf and double-closing it afterwards; that inner close is gone.
    try (IndexReader idxReader = DirectoryReader.open(directory)) {
        int docsNum = idxReader.numDocs();
        for (LeafReaderContext leaf : idxReader.leaves()) {
            LeafReader reader = leaf.reader();
            try {
                Terms terms = reader.terms(Constants.DOC_CONTENT);
                TermsEnum termsEnum = terms.iterator();
                PostingsEnum postings = null;

                BytesRef text;
                while ((text = termsEnum.next()) != null) {

                    postings = termsEnum.postings(postings);

                    while (postings.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                        int freq = postings.freq();
                        float tf = sim.tf(freq);
                        // NOTE(review): the original read the outer field
                        // indexReader here while computing — and never using —
                        // docsNum from this reader; this reader's doc count was
                        // presumably intended. Confirm against callers.
                        float idf = sim.idf(termsEnum.docFreq(), docsNum);
                        termScoreMap.put(text.utf8ToString(), BETA * (tf * idf));
                    }
                }

            } catch (IOException ex) {
                // Preserved: an I/O failure in one leaf is logged, not fatal.
                Logger.getLogger(RocchioQueryExpansion.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
    }

    return new ArrayList<>(termScoreMap.entrySet());
}

From source file:br.pucminas.ri.jsearch.queryexpansion.RocchioQueryExpansion.java

License: Open Source License

/**
 * Returns the TF-IDF score of the first posting of the first term (in any
 * leaf) of the DOC_CONTENT field that matches {@code term} case-insensitively,
 * or 0 when no match is found.
 *
 * @param directory directory holding the index to search
 * @param term      term text to look up (compared with equalsIgnoreCase)
 * @return tf * idf of the first matching posting, or 0
 * @throws CorruptIndexException if the index is corrupt
 * @throws IOException on failure opening the reader
 */
private float getScore(Directory directory, String term) throws CorruptIndexException, IOException {

    try (IndexReader idxReader = DirectoryReader.open(directory)) {

        ConcreteTFIDFSimilarity sim = new ConcreteTFIDFSimilarity();

        for (LeafReaderContext context : idxReader.leaves()) {
            LeafReader reader = context.reader();

            try {
                Terms terms = reader.terms(Constants.DOC_CONTENT);
                TermsEnum termsEnum = terms.iterator();
                PostingsEnum postings = null;

                BytesRef text;
                while ((text = termsEnum.next()) != null) {
                    // Only fetch postings for the matching term; the original
                    // requested a postings enum for every term it then skipped.
                    if (!text.utf8ToString().equalsIgnoreCase(term)) {
                        continue;
                    }
                    postings = termsEnum.postings(postings);
                    if (postings.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                        int freq = postings.freq();
                        float tf = sim.tf(freq);
                        // NOTE(review): indexReader is an outer field, not the
                        // reader opened here — possibly idxReader was intended.
                        float idf = sim.idf(termsEnum.docFreq(), indexReader.numDocs());
                        return tf * idf;
                    }
                }

            } catch (IOException ex) {
                // Preserved: an I/O failure in one leaf is logged, not fatal.
                Logger.getLogger(RocchioQueryExpansion.class.getName()).log(Level.SEVERE, null, ex);
            }
        }

    }

    return 0;
}

From source file:br.ufmt.periscope.indexer.resources.search.FastJoinTermEnum.java

@Override
protected AcceptStatus accept(BytesRef term) throws IOException {
    if (ts.execute(this.name.utf8ToString(), term.utf8ToString()) != 0) {
        if (ts.fuzzyCosine() != 0 || ts.fuzzyDice() != 0 || ts.fuzzyJaccard() != 0) {
            return AcceptStatus.YES;
        } else {/*  w  ww . j av a2  s.c o m*/
            return AcceptStatus.NO;
        }
    } else {
        return AcceptStatus.NO_AND_SEEK;
    }
}