Usage examples for `org.apache.lucene.util.BytesRef#utf8ToString()`
public String utf8ToString()
From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.newsFetch.storm.bolts.NewsItemToTermsBolt.java
License:Apache License
private void updateTermMap(DirectoryReader reader, IndexSearcher searcher, Map<String, Double> termMap, String id, String field, double weight) throws IOException { Query query = new TermQuery(new Term("id", id)); TopDocs topdocs = searcher.search(query, 1); if (topdocs.totalHits > 0) { int docNr = topdocs.scoreDocs[0].doc; Terms vector = reader.getTermVector(docNr, field); if (vector != null) { TermsEnum termsEnum;/*from w w w. j a va 2s .c o m*/ termsEnum = vector.iterator(TermsEnum.EMPTY); BytesRef text; while ((text = termsEnum.next()) != null) { String term = text.utf8ToString(); int docFreq = reader.docFreq(new Term(field, text)); // ignore really rare terms and really common terms double minFreq = reader.numDocs() * 0.0001; double maxFreq = reader.numDocs() / 3; //double minFreq = 0; //double maxFreq = Double.MAX_VALUE; if (docFreq > minFreq && docFreq < maxFreq) { double tf = 1 + ((double) termsEnum.totalTermFreq()) / reader.getSumTotalTermFreq(field); double idf = Math.log((double) reader.numDocs() / docFreq); if (!Double.isInfinite(idf)) { if (!termMap.containsKey(term)) { termMap.put(term, tf * idf * weight); } else { termMap.put(term, termMap.get(term) + tf * idf * weight); } } } } } else { logger.debug("no term available for doc=" + docNr + " and field=" + field); } } else { logger.warn("No documents found with id=" + id); } }
From source file:be.ugent.tiwi.sleroux.newsrec.stormNewsFetch.storm.bolts.NewsItemToTermsBolt.java
License:Apache License
private void updateTermMap(DirectoryReader reader, IndexSearcher searcher, Map<String, Double> termMap, long id, String field, double weight) throws IOException { Query query = NumericRangeQuery.newLongRange("id", id, id, true, true); TopDocs topdocs = searcher.search(query, 1); if (topdocs.totalHits > 0) { int docNr = topdocs.scoreDocs[0].doc; Terms vector = reader.getTermVector(docNr, field); if (vector != null) { TermsEnum termsEnum;/*from w ww . j ava 2 s .c o m*/ termsEnum = vector.iterator(TermsEnum.EMPTY); BytesRef text; while ((text = termsEnum.next()) != null) { String term = text.utf8ToString(); int docFreq = reader.docFreq(new Term(field, text)); // ignore really rare terms and really common terms //double minFreq = reader.numDocs() * 0.0001; //double maxFreq = reader.numDocs() / 3; double minFreq = 0; double maxFreq = Double.MAX_VALUE; if (docFreq > minFreq && docFreq < maxFreq) { double tf = 1 + ((double) termsEnum.totalTermFreq()) / reader.getSumTotalTermFreq(field); double idf = Math.log((double) reader.numDocs() / docFreq); if (!Double.isInfinite(idf)) { if (!termMap.containsKey(term)) { termMap.put(term, tf * idf * weight); } else { termMap.put(term, termMap.get(term) + tf * idf * weight); } } } } } else { logger.debug("no term available for doc=" + docNr + " and field=" + field); } } else { logger.warn("No documents found with id=" + id); } }
From source file:BlockBuilding.AbstractBlockBuilding.java
License:Apache License
protected Map<String, int[]> parseD1Index(IndexReader d1Index, IndexReader d2Index) { try {/*from w w w . j av a2s. c o m*/ int[] documentIds = getDocumentIds(d1Index); final Map<String, int[]> hashedBlocks = new HashMap<>(); Fields fields = MultiFields.getFields(d1Index); for (String field : fields) { Terms terms = fields.terms(field); TermsEnum termsEnum = terms.iterator(); BytesRef text; while ((text = termsEnum.next()) != null) { // check whether it is a common term int d2DocFrequency = d2Index.docFreq(new Term(field, text)); if (d2DocFrequency == 0) { continue; } final List<Integer> entityIds = new ArrayList<>(); PostingsEnum pe = MultiFields.getTermDocsEnum(d1Index, field, text); int doc; while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { entityIds.add(documentIds[doc]); } int[] idsArray = Converter.convertCollectionToArray(entityIds); hashedBlocks.put(text.utf8ToString(), idsArray); } } return hashedBlocks; } catch (IOException ex) { LOGGER.log(Level.SEVERE, null, ex); return null; } }
From source file:BlockBuilding.AbstractBlockBuilding.java
License:Apache License
protected void parseD2Index(IndexReader d2Index, Map<String, int[]> hashedBlocks) { try {//from w w w .j a va2s . c om int[] documentIds = getDocumentIds(d2Index); Fields fields = MultiFields.getFields(d2Index); for (String field : fields) { Terms terms = fields.terms(field); TermsEnum termsEnum = terms.iterator(); BytesRef text; while ((text = termsEnum.next()) != null) { if (!hashedBlocks.containsKey(text.utf8ToString())) { continue; } final List<Integer> entityIds = new ArrayList<>(); PostingsEnum pe = MultiFields.getTermDocsEnum(d2Index, field, text); int doc; while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { entityIds.add(documentIds[doc]); } int[] idsArray = Converter.convertCollectionToArray(entityIds); int[] d1Entities = hashedBlocks.get(text.utf8ToString()); blocks.add(new BilateralBlock(d1Entities, idsArray)); } } } catch (IOException ex) { LOGGER.log(Level.SEVERE, null, ex); } }
From source file:BlockBuilding.AbstractSortedNeighborhoodBlocking.java
License:Open Source License
/**
 * Collects the distinct term texts across every field of the given index reader.
 * Despite the original variable naming, the result is an unordered {@link HashSet}.
 *
 * @param iReader reader over the index to enumerate
 * @return set of all term texts; possibly incomplete if an I/O error occurs mid-scan
 */
protected Set<String> getTerms(IndexReader iReader) {
    Set<String> termSet = new HashSet<>();
    try {
        Fields fields = MultiFields.getFields(iReader);
        for (String field : fields) {
            TermsEnum termsEnum = fields.terms(field).iterator(null);
            for (BytesRef text = termsEnum.next(); text != null; text = termsEnum.next()) {
                termSet.add(text.utf8ToString());
            }
        }
    } catch (IOException ex) {
        // NOTE(review): no logger field is visible in this class; stack trace kept
        // to preserve existing behavior.
        ex.printStackTrace();
    }
    return termSet;
}
From source file:BlockBuilding.SortedNeighborhoodBlocking.java
License:Apache License
/**
 * Gathers the distinct term texts from every field of the index. The returned set is
 * unordered (a {@link HashSet}) despite the original local's "sorted" name.
 *
 * @param iReader reader over the index to enumerate
 * @return set of all term texts; possibly incomplete if an I/O error occurs mid-scan
 */
protected Set<String> getTerms(IndexReader iReader) {
    Set<String> termSet = new HashSet<>();
    try {
        Fields fields = MultiFields.getFields(iReader);
        for (String field : fields) {
            TermsEnum termsEnum = fields.terms(field).iterator();
            for (BytesRef text = termsEnum.next(); text != null; text = termsEnum.next()) {
                termSet.add(text.utf8ToString());
            }
        }
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
    }
    return termSet;
}
From source file:br.bireme.ngrams.Tools.java
public static void showTerms(final String indexName, final String fieldName) throws IOException { if (indexName == null) { throw new NullPointerException("indexName"); }//from w ww.j a va 2s . co m if (fieldName == null) { throw new NullPointerException("fieldName"); } try (Directory directory = FSDirectory.open(new File(indexName).toPath())) { final DirectoryReader ireader = DirectoryReader.open(directory); final List<LeafReaderContext> leaves = ireader.leaves(); if (leaves.isEmpty()) { throw new IOException("empty leaf readers list"); } final Terms terms = leaves.get(0).reader().terms(fieldName); /*final Terms terms = SlowCompositeReaderWrapper.wrap(ireader) .terms(fieldName);*/ if (terms != null) { final TermsEnum tenum = terms.iterator(); int pos = 0; // PostingsEnum penum = null; while (true) { final BytesRef br = tenum.next(); if (br == null) { break; } System.out.println((++pos) + ") term=[" + br.utf8ToString() + "] "); /* penum = tenum.postings(penum, PostingsEnum.OFFSETS); while (penum.nextDoc() != PostingsEnum.NO_MORE_DOCS) { System.out.print(" startOffset=" + penum.startOffset()); System.out.println(" endOffset:" + penum.endOffset()); } */ } } } }
From source file:br.pucminas.ri.jsearch.queryexpansion.RocchioQueryExpansion.java
License:Open Source License
private List<Entry<String, Float>> getTermScoreList(Directory directory) throws CorruptIndexException, IOException { Map<String, Float> termScoreMap = new HashMap<>(); ConcreteTFIDFSimilarity sim = new ConcreteTFIDFSimilarity(); try (IndexReader idxReader = DirectoryReader.open(directory)) { idxReader.leaves().stream().map((leaf) -> leaf.reader()).forEach((reader) -> { try { Terms terms = reader.terms(Constants.DOC_CONTENT); TermsEnum termsEnum = terms.iterator(); PostingsEnum postings = null; int docsNum = idxReader.numDocs(); BytesRef text; while ((text = termsEnum.next()) != null) { postings = termsEnum.postings(postings); while (postings.nextDoc() != PostingsEnum.NO_MORE_DOCS) { int freq = postings.freq(); float tf = sim.tf(freq); float idf = sim.idf(termsEnum.docFreq(), indexReader.numDocs()); termScoreMap.put(text.utf8ToString(), BETA * (tf * idf)); }// w w w. j a v a2 s . co m } } catch (IOException ex) { Logger.getLogger(RocchioQueryExpansion.class.getName()).log(Level.SEVERE, null, ex); } finally { try { idxReader.close(); } catch (IOException ex) { Logger.getLogger(RocchioQueryExpansion.class.getName()).log(Level.SEVERE, null, ex); } } }); } return new ArrayList<>(termScoreMap.entrySet()); }
From source file:br.pucminas.ri.jsearch.queryexpansion.RocchioQueryExpansion.java
License:Open Source License
private float getScore(Directory directory, String term) throws CorruptIndexException, IOException { try (IndexReader idxReader = DirectoryReader.open(directory)) { ConcreteTFIDFSimilarity sim = new ConcreteTFIDFSimilarity(); for (LeafReaderContext context : idxReader.leaves()) { LeafReader reader = context.reader(); try { Terms terms = reader.terms(Constants.DOC_CONTENT); TermsEnum termsEnum = terms.iterator(); PostingsEnum postings = null; BytesRef text; while ((text = termsEnum.next()) != null) { postings = termsEnum.postings(postings); if (text.utf8ToString().equalsIgnoreCase(term)) { while (postings.nextDoc() != PostingsEnum.NO_MORE_DOCS) { int freq = postings.freq(); float tf = sim.tf(freq); float idf = sim.idf(termsEnum.docFreq(), indexReader.numDocs()); return tf * idf; }// w w w . ja v a2 s . c o m } } } catch (IOException ex) { Logger.getLogger(RocchioQueryExpansion.class.getName()).log(Level.SEVERE, null, ex); } } } return 0; }
From source file:br.ufmt.periscope.indexer.resources.search.FastJoinTermEnum.java
/**
 * Decides whether a candidate term is accepted by the fuzzy-join enumeration.
 * A zero result from {@code ts.execute} rejects the term and lets the enum seek past
 * it; otherwise the term is accepted only when at least one fuzzy similarity metric
 * (cosine, Dice, Jaccard) is non-zero.
 */
@Override
protected AcceptStatus accept(BytesRef term) throws IOException {
    // Zero score: reject and allow the enumeration to seek ahead.
    if (ts.execute(this.name.utf8ToString(), term.utf8ToString()) == 0) {
        return AcceptStatus.NO_AND_SEEK;
    }
    boolean anyMetricNonZero = ts.fuzzyCosine() != 0
            || ts.fuzzyDice() != 0
            || ts.fuzzyJaccard() != 0;
    return anyMetricNonZero ? AcceptStatus.YES : AcceptStatus.NO;
}