List of usage examples for org.apache.lucene.index Terms iterator
public abstract TermsEnum iterator() throws IOException;
From source file:SimpleNaiveBayesClassifier.java
License:Apache License
/** * Calculate probabilities for all classes for a given input text * @param inputDocument the input text as a {@code String} * @return a {@code List} of {@code ClassificationResult}, one for each existing class * @throws IOException if assigning probabilities fails *///from ww w.j a v a2 s . c o m protected List<ClassificationResult<BytesRef>> assignClassNormalizedList(String inputDocument) throws IOException { List<ClassificationResult<BytesRef>> assignedClasses = new ArrayList<>(); Terms classes = MultiFields.getTerms(leafReader, classFieldName); TermsEnum classesEnum = classes.iterator(); BytesRef next; String[] tokenizedText = tokenize(inputDocument); int docsWithClassSize = countDocsWithClass(); while ((next = classesEnum.next()) != null) { if (next.length > 0) { Term term = new Term(this.classFieldName, next); double clVal = calculateLogPrior(term, docsWithClassSize) + calculateLogLikelihood(tokenizedText, term, docsWithClassSize); assignedClasses.add(new ClassificationResult<>(term.bytes(), clVal)); } } // normalization; the values transforms to a 0-1 range return normClassificationResults(assignedClasses); }
From source file:SimpleNaiveBayesDocumentClassifier.java
License:Apache License
/**
 * Assigns a score to every class term of {@code classFieldName} for the given
 * document: per text field, the log prior plus the boosted log likelihood is
 * summed over that field's token arrays, and the per-field sums are added up.
 *
 * @param inputDocument the document to classify
 * @return one {@code ClassificationResult} per class, with normalized scores
 * @throws IOException if reading the index fails
 */
private List<ClassificationResult<BytesRef>> assignNormClasses(Document inputDocument) throws IOException {
    List<ClassificationResult<BytesRef>> assignedClasses = new ArrayList<>();
    Map<String, List<String[]>> fieldName2tokensArray = new LinkedHashMap<>();
    Map<String, Float> fieldName2boost = new LinkedHashMap<>();
    Terms classes = MultiFields.getTerms(leafReader, classFieldName);
    TermsEnum classesEnum = classes.iterator();
    BytesRef c;
    // tokenize the document's text fields and record each field's boost
    analyzeSeedDocument(inputDocument, fieldName2tokensArray, fieldName2boost);
    int docsWithClassSize = countDocsWithClass();
    while ((c = classesEnum.next()) != null) {
        double classScore = 0;
        Term term = new Term(this.classFieldName, c);
        for (String fieldName : textFieldNames) {
            List<String[]> tokensArrays = fieldName2tokensArray.get(fieldName);
            double fieldScore = 0;
            for (String[] fieldTokensArray : tokensArrays) {
                // NOTE(review): by operator precedence the boost multiplies only the
                // likelihood term, not the prior — confirm this is intended
                fieldScore += calculateLogPrior(term, docsWithClassSize)
                        + calculateLogLikelihood(fieldTokensArray, fieldName, term, docsWithClassSize)
                                * fieldName2boost.get(fieldName);
            }
            classScore += fieldScore;
        }
        assignedClasses.add(new ClassificationResult<>(term.bytes(), classScore));
    }
    // scale the raw scores into the 0-1 range
    return normClassificationResults(assignedClasses);
}
From source file:alix.lucene.MoreLikeThis.java
License:Apache License
/** * Adds terms and frequencies found in vector into the Map termFreqMap * * @param termFreqMap a Map of terms and their frequencies * @param vector List of terms and their frequencies for a doc/field *//*from w w w. jav a2 s.c o m*/ private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException { final TermsEnum termsEnum = vector.iterator(); final CharsRefBuilder spare = new CharsRefBuilder(); BytesRef text; while ((text = termsEnum.next()) != null) { spare.copyUTF8Bytes(text); final String term = spare.toString(); if (isNoiseWord(term)) { continue; } final int freq = (int) termsEnum.totalTermFreq(); // increment frequency Int cnt = termFreqMap.get(term); if (cnt == null) { cnt = new Int(); termFreqMap.put(term, cnt); cnt.x = freq; } else { cnt.x += freq; } } }
From source file:alix.lucene.MoreLikeThis.java
License:Apache License
/** * Print a term vector for debugging//from w ww . j a v a 2s . co m * * @param vector List of terms and their frequencies for a doc/field * @throws IOException */ @SuppressWarnings("unused") private void print(Terms vector) throws IOException { if (vector == null) return; final TermsEnum termsEnum = vector.iterator(); final CharsRefBuilder spare = new CharsRefBuilder(); BytesRef text; // termsEnum.docFreq() = 1, // The returned Fields instance acts like a single-document inverted index HashMap<String, Long> map = new HashMap<String, Long>(); while ((text = termsEnum.next()) != null) { spare.copyUTF8Bytes(text); map.put(spare.toString(), termsEnum.totalTermFreq()); } @SuppressWarnings("unchecked") Map.Entry<String, Long>[] a = map.entrySet().toArray(new Map.Entry[0]); Arrays.sort(a, new Comparator<Map.Entry<String, Long>>() { public int compare(Map.Entry<String, Long> o1, Map.Entry<String, Long> o2) { return o2.getValue().compareTo(o1.getValue()); } }); for (Map.Entry<String, Long> e : a) { System.out.print(e.getKey() + ":" + e.getValue() + " "); } System.out.println(); }
From source file:BlockBuilding.AbstractBlockBuilding.java
License:Apache License
protected Map<String, int[]> parseD1Index(IndexReader d1Index, IndexReader d2Index) { try {/*from w ww . ja va2 s . c om*/ int[] documentIds = getDocumentIds(d1Index); final Map<String, int[]> hashedBlocks = new HashMap<>(); Fields fields = MultiFields.getFields(d1Index); for (String field : fields) { Terms terms = fields.terms(field); TermsEnum termsEnum = terms.iterator(); BytesRef text; while ((text = termsEnum.next()) != null) { // check whether it is a common term int d2DocFrequency = d2Index.docFreq(new Term(field, text)); if (d2DocFrequency == 0) { continue; } final List<Integer> entityIds = new ArrayList<>(); PostingsEnum pe = MultiFields.getTermDocsEnum(d1Index, field, text); int doc; while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { entityIds.add(documentIds[doc]); } int[] idsArray = Converter.convertCollectionToArray(entityIds); hashedBlocks.put(text.utf8ToString(), idsArray); } } return hashedBlocks; } catch (IOException ex) { LOGGER.log(Level.SEVERE, null, ex); return null; } }
From source file:BlockBuilding.AbstractBlockBuilding.java
License:Apache License
/**
 * Scans the second dataset's index and, for every term already present in
 * {@code hashedBlocks}, appends a {@code BilateralBlock} pairing the stored
 * D1 entity ids with the D2 entity ids of the documents containing the term.
 *
 * @param d2Index reader over the second dataset's index
 * @param hashedBlocks term text -> D1 entity ids, as built from the D1 index
 */
protected void parseD2Index(IndexReader d2Index, Map<String, int[]> hashedBlocks) {
    try {
        final int[] documentIds = getDocumentIds(d2Index);
        final Fields fields = MultiFields.getFields(d2Index);
        for (String field : fields) {
            final TermsEnum termsEnum = fields.terms(field).iterator();
            for (BytesRef text = termsEnum.next(); text != null; text = termsEnum.next()) {
                final String token = text.utf8ToString();
                // only terms shared with the first index form bilateral blocks
                if (!hashedBlocks.containsKey(token)) {
                    continue;
                }
                final List<Integer> entityIds = new ArrayList<>();
                final PostingsEnum pe = MultiFields.getTermDocsEnum(d2Index, field, text);
                for (int doc = pe.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = pe.nextDoc()) {
                    entityIds.add(documentIds[doc]);
                }
                blocks.add(new BilateralBlock(hashedBlocks.get(token),
                        Converter.convertCollectionToArray(entityIds)));
            }
        }
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
    }
}
From source file:BlockBuilding.AbstractBlockBuilding.java
License:Apache License
protected void parseIndex(IndexReader d1Index) { try {//from w ww . ja v a 2s . c o m int[] documentIds = getDocumentIds(d1Index); Fields fields = MultiFields.getFields(d1Index); for (String field : fields) { Terms terms = fields.terms(field); TermsEnum termsEnum = terms.iterator(); BytesRef text; while ((text = termsEnum.next()) != null) { if (termsEnum.docFreq() < 2) { continue; } final List<Integer> entityIds = new ArrayList<>(); PostingsEnum pe = MultiFields.getTermDocsEnum(d1Index, field, text); int doc; while ((doc = pe.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { entityIds.add(documentIds[doc]); } int[] idsArray = Converter.convertCollectionToArray(entityIds); UnilateralBlock block = new UnilateralBlock(idsArray); blocks.add(block); } } } catch (IOException ex) { LOGGER.log(Level.SEVERE, null, ex); } }
From source file:BlockBuilding.SortedNeighborhoodBlocking.java
License:Apache License
/**
 * Collects the distinct term texts of every field in the given index.
 * NOTE(review): the returned {@code HashSet} is unordered — callers that need
 * sorted terms must sort it themselves.
 *
 * @param iReader reader over the index to scan
 * @return the set of all term texts (possibly partial on I/O failure)
 */
protected Set<String> getTerms(IndexReader iReader) {
    final Set<String> terms = new HashSet<>();
    try {
        final Fields fields = MultiFields.getFields(iReader);
        for (String field : fields) {
            final TermsEnum termsEnum = fields.terms(field).iterator();
            for (BytesRef text = termsEnum.next(); text != null; text = termsEnum.next()) {
                terms.add(text.utf8ToString());
            }
        }
    } catch (IOException ex) {
        LOGGER.log(Level.SEVERE, null, ex);
    }
    return terms;
}
From source file:br.bireme.ngrams.Tools.java
public static void showTerms(final String indexName, final String fieldName) throws IOException { if (indexName == null) { throw new NullPointerException("indexName"); }//from ww w . j a v a 2s . com if (fieldName == null) { throw new NullPointerException("fieldName"); } try (Directory directory = FSDirectory.open(new File(indexName).toPath())) { final DirectoryReader ireader = DirectoryReader.open(directory); final List<LeafReaderContext> leaves = ireader.leaves(); if (leaves.isEmpty()) { throw new IOException("empty leaf readers list"); } final Terms terms = leaves.get(0).reader().terms(fieldName); /*final Terms terms = SlowCompositeReaderWrapper.wrap(ireader) .terms(fieldName);*/ if (terms != null) { final TermsEnum tenum = terms.iterator(); int pos = 0; // PostingsEnum penum = null; while (true) { final BytesRef br = tenum.next(); if (br == null) { break; } System.out.println((++pos) + ") term=[" + br.utf8ToString() + "] "); /* penum = tenum.postings(penum, PostingsEnum.OFFSETS); while (penum.nextDoc() != PostingsEnum.NO_MORE_DOCS) { System.out.print(" startOffset=" + penum.startOffset()); System.out.println(" endOffset:" + penum.endOffset()); } */ } } } }
From source file:br.pucminas.ri.jsearch.queryexpansion.RocchioQueryExpansion.java
License:Open Source License
private List<Entry<String, Float>> getTermScoreList(Directory directory) throws CorruptIndexException, IOException { Map<String, Float> termScoreMap = new HashMap<>(); ConcreteTFIDFSimilarity sim = new ConcreteTFIDFSimilarity(); try (IndexReader idxReader = DirectoryReader.open(directory)) { idxReader.leaves().stream().map((leaf) -> leaf.reader()).forEach((reader) -> { try { Terms terms = reader.terms(Constants.DOC_CONTENT); TermsEnum termsEnum = terms.iterator(); PostingsEnum postings = null; int docsNum = idxReader.numDocs(); BytesRef text;//from www . jav a 2 s .c o m while ((text = termsEnum.next()) != null) { postings = termsEnum.postings(postings); while (postings.nextDoc() != PostingsEnum.NO_MORE_DOCS) { int freq = postings.freq(); float tf = sim.tf(freq); float idf = sim.idf(termsEnum.docFreq(), indexReader.numDocs()); termScoreMap.put(text.utf8ToString(), BETA * (tf * idf)); } } } catch (IOException ex) { Logger.getLogger(RocchioQueryExpansion.class.getName()).log(Level.SEVERE, null, ex); } finally { try { idxReader.close(); } catch (IOException ex) { Logger.getLogger(RocchioQueryExpansion.class.getName()).log(Level.SEVERE, null, ex); } } }); } return new ArrayList<>(termScoreMap.entrySet()); }