List of usage examples for org.apache.lucene.index.Fields#terms

public abstract Terms terms(String field) throws IOException;

Gets the Terms for the given field, or null if the field does not exist.
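Before the collected examples, a minimal sketch of the typical call pattern, assuming the Lucene 5.x-era APIs that most of the examples below use; the class name FieldsTermsSketch, the method dumpTerms, and its reader/fieldName parameters are hypothetical placeholders, not taken from any example. The one point shared by every usage below: terms(String) may return null when a segment does not contain the field, so callers must check before iterating.

import java.io.IOException;

import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

public class FieldsTermsSketch {

    // Prints every unique term of one field, segment by segment.
    // "reader" and "fieldName" are hypothetical inputs for illustration;
    // Fields.terms(String) may return null for a field that is missing
    // from a segment, so the null check is required.
    static void dumpTerms(IndexReader reader, String fieldName) throws IOException {
        for (LeafReaderContext ctx : reader.leaves()) {
            Fields fields = ctx.reader().fields(); // per-segment postings (Lucene 5.x-era API)
            Terms terms = fields.terms(fieldName); // null if this segment lacks the field
            if (terms == null) {
                continue;
            }
            TermsEnum termsEnum = terms.iterator();
            BytesRef text;
            while ((text = termsEnum.next()) != null) {
                System.out.println(text.utf8ToString());
            }
        }
    }
}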
From source file: org.codelibs.elasticsearch.common.lucene.search.MoreLikeThisQuery.java
License: Apache License
private void handleUnlike(XMoreLikeThis mlt, String[] unlikeText, Fields[] unlikeFields) throws IOException {
    Set<Term> skipTerms = new HashSet<>();
    // handle unlike text
    if (unlikeText != null) {
        for (String text : unlikeText) {
            // only use the first field to be consistent
            String fieldName = moreLikeFields[0];
            try (TokenStream ts = analyzer.tokenStream(fieldName, text)) {
                CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
                ts.reset();
                while (ts.incrementToken()) {
                    skipTerms.add(new Term(fieldName, termAtt.toString()));
                }
                ts.end();
            }
        }
    }
    // handle unlike fields
    if (unlikeFields != null) {
        for (Fields fields : unlikeFields) {
            for (String fieldName : fields) {
                Terms terms = fields.terms(fieldName);
                final TermsEnum termsEnum = terms.iterator();
                BytesRef text;
                while ((text = termsEnum.next()) != null) {
                    skipTerms.add(new Term(fieldName, text.utf8ToString()));
                }
            }
        }
    }
    if (!skipTerms.isEmpty()) {
        mlt.setSkipTerms(skipTerms);
    }
}
From source file: org.codelibs.elasticsearch.common.lucene.search.XMoreLikeThis.java
License: Apache License
/**
 * Return a query that will return docs like the passed Fields.
 *
 * @return a query that will return docs like the passed Fields.
 */
public Query like(Fields... likeFields) throws IOException {
    // get all field names
    Set<String> fieldNames = new HashSet<>();
    for (Fields fields : likeFields) {
        for (String fieldName : fields) {
            fieldNames.add(fieldName);
        }
    }
    // term selection is per field, then appended to a single boolean query
    BooleanQuery.Builder bq = new BooleanQuery.Builder();
    for (String fieldName : fieldNames) {
        Map<String, Int> termFreqMap = new HashMap<>();
        for (Fields fields : likeFields) {
            Terms vector = fields.terms(fieldName);
            if (vector != null) {
                addTermFrequencies(termFreqMap, vector, fieldName);
            }
        }
        addToQuery(createQueue(termFreqMap, fieldName), bq);
    }
    return bq.build();
}
From source file: org.codelibs.elasticsearch.common.lucene.search.XMoreLikeThis.java
License: Apache License
/**
 * Find words for a more-like-this query former.
 *
 * @param docNum the id of the lucene document from which to find terms
 */
private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException {
    Map<String, Int> termFreqMap = new HashMap<>();
    for (String fieldName : fieldNames) {
        final Fields vectors = ir.getTermVectors(docNum);
        final Terms vector;
        if (vectors != null) {
            vector = vectors.terms(fieldName);
        } else {
            vector = null;
        }
        // field does not store term vector info
        if (vector == null) {
            Document d = ir.document(docNum);
            IndexableField[] fields = d.getFields(fieldName);
            for (IndexableField field : fields) {
                final String stringValue = field.stringValue();
                if (stringValue != null) {
                    addTermFrequencies(new FastStringReader(stringValue), termFreqMap, fieldName);
                }
            }
        } else {
            addTermFrequencies(termFreqMap, vector, fieldName);
        }
    }
    return createQueue(termFreqMap);
}
From source file: org.codelibs.elasticsearch.search.suggest.completion2x.AnalyzingCompletionLookupProvider.java
License: Apache License
@Override
public FieldsConsumer consumer(final IndexOutput output) throws IOException {
    CodecUtil.writeHeader(output, CODEC_NAME, CODEC_VERSION_LATEST);
    return new FieldsConsumer() {
        private Map<String, Long> fieldOffsets = new HashMap<>();

        @Override
        public void close() throws IOException {
            try {
                /*
                 * write the offsets per field such that we know where
                 * we need to load the FSTs from
                 */
                long pointer = output.getFilePointer();
                output.writeVInt(fieldOffsets.size());
                for (Map.Entry<String, Long> entry : fieldOffsets.entrySet()) {
                    output.writeString(entry.getKey());
                    output.writeVLong(entry.getValue());
                }
                output.writeLong(pointer);
                CodecUtil.writeFooter(output);
            } finally {
                IOUtils.close(output);
            }
        }

        @Override
        public void write(Fields fields) throws IOException {
            for (String field : fields) {
                Terms terms = fields.terms(field);
                if (terms == null) {
                    continue;
                }
                terms.iterator();
                new SuggestPayload();
                // the FST-building body is stubbed out in this fork; the original
                // implementation is preserved below as commented-out code
                throw new UnsupportedOperationException("QueryBuilders does not support this operation.");
                // final XAnalyzingSuggester.XBuilder builder = new XAnalyzingSuggester.XBuilder(
                //         maxSurfaceFormsPerAnalyzedForm, hasPayloads, XAnalyzingSuggester.PAYLOAD_SEP);
                // int docCount = 0;
                // while (true) {
                //     BytesRef term = termsEnum.next();
                //     if (term == null) {
                //         break;
                //     }
                //     docsEnum = termsEnum.postings(docsEnum, PostingsEnum.PAYLOADS);
                //     builder.startTerm(term);
                //     int docFreq = 0;
                //     while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                //         for (int i = 0; i < docsEnum.freq(); i++) {
                //             final int position = docsEnum.nextPosition();
                //             AnalyzingCompletionLookupProvider.this.parsePayload(docsEnum.getPayload(), spare);
                //             builder.addSurface(spare.surfaceForm.get(), spare.payload.get(), spare.weight);
                //             // multi fields have the same surface form so we sum up here
                //             maxAnalyzedPathsForOneInput = Math.max(maxAnalyzedPathsForOneInput, position + 1);
                //         }
                //         docFreq++;
                //         docCount = Math.max(docCount, docsEnum.docID() + 1);
                //     }
                //     builder.finishTerm(docFreq);
                // }
                // /*
                //  * Here we are done processing the field and we can
                //  * build the FST and write it to disk.
                //  */
                // FST<Pair<Long, BytesRef>> build = builder.build();
                // assert build != null || docCount == 0 : "the FST is null but docCount is != 0 actual value: [" + docCount + "]";
                // /*
                //  * it's possible that the FST is null if we have 2 segments that get merged
                //  * and all docs that have a value in this field are deleted. This will cause
                //  * a consumer to be created but it doesn't consume any values causing the FSTBuilder
                //  * to return null.
                //  */
                // if (build != null) {
                //     fieldOffsets.put(field, output.getFilePointer());
                //     build.save(output);
                //     /* write some more meta-info */
                //     output.writeVInt(maxAnalyzedPathsForOneInput);
                //     output.writeVInt(maxSurfaceFormsPerAnalyzedForm);
                //     output.writeInt(maxGraphExpansions); // can be negative
                //     int options = 0;
                //     options |= preserveSep ? SERIALIZE_PRESERVE_SEPARATORS : 0;
                //     options |= hasPayloads ? SERIALIZE_HAS_PAYLOADS : 0;
                //     options |= preservePositionIncrements ? SERIALIZE_PRESERVE_POSITION_INCREMENTS : 0;
                //     output.writeVInt(options);
                //     output.writeVInt(XAnalyzingSuggester.SEP_LABEL);
                //     output.writeVInt(XAnalyzingSuggester.END_BYTE);
                //     output.writeVInt(XAnalyzingSuggester.PAYLOAD_SEP);
                //     output.writeVInt(XAnalyzingSuggester.HOLE_CHARACTER);
                // }
            }
        }
    };
}
From source file: org.codelibs.elasticsearch.search.suggest.completion2x.Completion090PostingsFormat.java
License: Apache License
/**
 * Returns total in-heap bytes used by all suggesters. This method has CPU cost <code>O(numIndexedFields)</code>.
 *
 * @param fieldNamePatterns if non-null, any completion field name matching any of these patterns will break out
 *                          its in-heap bytes separately in the returned {@link CompletionStats}
 */
public CompletionStats completionStats(IndexReader indexReader, String... fieldNamePatterns) {
    CompletionStats completionStats = new CompletionStats();
    for (LeafReaderContext atomicReaderContext : indexReader.leaves()) {
        LeafReader atomicReader = atomicReaderContext.reader();
        try {
            Fields fields = atomicReader.fields();
            for (String fieldName : fields) {
                Terms terms = fields.terms(fieldName);
                if (terms instanceof CompletionTerms) {
                    CompletionTerms completionTerms = (CompletionTerms) terms;
                    completionStats.add(completionTerms.stats(fieldNamePatterns));
                }
            }
        } catch (IOException ioe) {
            logger.error("Could not get completion stats", ioe);
        }
    }
    return completionStats;
}
From source file: org.dice.solrenhancements.morelikethis.MoreLikeThis.java
License: Apache License
/**
 * Find words for a more-queryFromDocuments-this query former.
 *
 * @param docNum the id of the lucene document from which to find terms
 * @param fields the list of fields of the lucene document from which to extract terms
 * @param fieldToTermFreqMap data structure to populate with term frequencies
 */
public Map<String, Map<String, Flt>> retrieveTerms(int docNum, String[] fields,
        Map<String, Map<String, Flt>> fieldToTermFreqMap) throws IOException {
    if (fieldToTermFreqMap == null) {
        fieldToTermFreqMap = new HashMap<String, Map<String, Flt>>();
    }
    if (fields == null || fields.length == 0) {
        return fieldToTermFreqMap;
    }

    final Fields vectors = ir.getTermVectors(docNum);
    final Document document = ir.document(docNum);

    for (String fieldName : fields) {
        Map<String, Flt> termFreqMap = null;
        if (fieldToTermFreqMap.containsKey(fieldName)) {
            termFreqMap = fieldToTermFreqMap.get(fieldName);
        } else {
            termFreqMap = new HashMap<String, Flt>();
            fieldToTermFreqMap.put(fieldName, termFreqMap);
        }

        Terms vector = null;
        if (vectors != null) {
            vector = vectors.terms(fieldName);
        }

        // field does not store term vector info
        // even if term vectors enabled, need to extract payload from regular field reader
        if (vector == null || isPayloadField(fieldName)) {
            IndexableField[] docFields = document.getFields(fieldName);
            for (IndexableField field : docFields) {
                final String stringValue = field.stringValue();
                if (stringValue != null) {
                    addTermWeights(new StringReader(stringValue), termFreqMap, fieldName);
                }
            }
        } else {
            addTermWeights(termFreqMap, vector);
        }
    }
    return fieldToTermFreqMap;
}
From source file: org.dice.solrenhancements.unsupervisedfeedback.UnsupervisedFeedback.java
License: Apache License
public Map<String, Map<String, Flt>> retrieveTerms(int docNum, Map<String, Map<String, Flt>> fieldToTermFreqMap)
        throws IOException {
    if (fieldToTermFreqMap == null) {
        fieldToTermFreqMap = new HashMap<String, Map<String, Flt>>();
    }
    for (String fieldName : getFieldNames()) {
        Map<String, Flt> termFreqMap = null;
        if (fieldToTermFreqMap.containsKey(fieldName)) {
            termFreqMap = fieldToTermFreqMap.get(fieldName);
        } else {
            termFreqMap = new HashMap<String, Flt>();
            fieldToTermFreqMap.put(fieldName, termFreqMap);
        }

        final Fields vectors = ir.getTermVectors(docNum);
        final Terms vector;
        if (vectors != null) {
            vector = vectors.terms(fieldName);
        } else {
            vector = null;
        }

        // field does not store term vector info
        // even if term vectors enabled, need to extract payload from regular field reader
        if (vector == null || isPayloadField(fieldName)) {
            Document d = ir.document(docNum);
            IndexableField[] fields = d.getFields(fieldName);
            for (IndexableField field : fields) {
                final String stringValue = field.stringValue();
                if (stringValue != null) {
                    addTermWeights(new StringReader(stringValue), termFreqMap, fieldName);
                }
            }
        } else {
            addTermWeights(termFreqMap, vector);
        }
    }
    return fieldToTermFreqMap;
}
From source file: org.dkpro.tc.features.ngram.LuceneNgramDocumentTest.java
License: Apache License
private Set<String> getTokensFromIndex(File luceneFolder) throws Exception {
    Set<String> token = new HashSet<>();
    @SuppressWarnings("deprecation")
    IndexReader idxReader = IndexReader.open(FSDirectory.open(luceneFolder));
    Fields fields = MultiFields.getFields(idxReader);
    for (String field : fields) {
        if (field.equals("id")) {
            continue;
        }
        Terms terms = fields.terms(field);
        TermsEnum termsEnum = terms.iterator(null);
        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            token.add(text.utf8ToString());
        }
    }
    return token;
}
From source file: org.dkpro.tc.features.ngram.meta.LuceneCharacterNGramMetaCollectorTest.java
License: Apache License
@SuppressWarnings("unused") @Test/*from w ww . ja v a2 s .c om*/ public void luceneCharacterNgramMetaCollectorTest() throws Exception { File tmpDir = folder.newFolder(); CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TextReader.class, TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/data/", TextReader.PARAM_LANGUAGE, "en", TextReader.PARAM_PATTERNS, "charMetaCollectorTest.txt"); AnalysisEngineDescription segmenter = AnalysisEngineFactory .createEngineDescription(BreakIteratorSegmenter.class); AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class, DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT); AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription( LuceneCharacterNGramMetaCollector.class, LuceneCharacterNGram.PARAM_UNIQUE_EXTRACTOR_NAME, "123", LuceneCharacterNGram.PARAM_NGRAM_MIN_N, 2, LuceneCharacterNGramMetaCollector.PARAM_TARGET_LOCATION, tmpDir, LuceneCharacterNGram.PARAM_SOURCE_LOCATION, tmpDir); for (JCas jcas : new JCasIterable(reader, segmenter, doc, metaCollector)) { // System.out.println(jcas.getDocumentText().length()); } Set<String> freq2terms = new HashSet<>(); int i = 0; IndexReader index; try { index = DirectoryReader.open(FSDirectory.open(tmpDir)); Fields fields = MultiFields.getFields(index); if (fields != null) { Terms terms = fields.terms(LuceneCharacterNGramMetaCollector.LUCENE_CHAR_NGRAM_FIELD + "123"); if (terms != null) { TermsEnum termsEnum = terms.iterator(null); BytesRef text = null; while ((text = termsEnum.next()) != null) { if (termsEnum.totalTermFreq() == 2) { freq2terms.add(text.utf8ToString()); } // System.out.println(text.utf8ToString() + " " + termsEnum.totalTermFreq()); i++; } } } } catch (Exception e) { throw new ResourceInitializationException(e); } assertEquals(10, i); assertEquals(1, freq2terms.size()); }
From source file: org.dkpro.tc.features.ngram.meta.LuceneNGramMetaCollectorTest.java
License: Apache License
@Test
public void luceneNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TextReader.class,
            TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/data/", TextReader.PARAM_LANGUAGE, "en",
            TextReader.PARAM_PATTERNS, "text*.txt");

    AnalysisEngineDescription segmenter = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);

    AnalysisEngineDescription doc = AnalysisEngineFactory.createEngineDescription(DocumentModeAnnotator.class,
            DocumentModeAnnotator.PARAM_FEATURE_MODE, Constants.FM_DOCUMENT);

    AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription(
            LuceneNGramMetaCollector.class, LuceneNGramMetaCollector.PARAM_TARGET_LOCATION, tmpDir,
            LuceneNGramMetaCollector.PARAM_UNIQUE_EXTRACTOR_NAME, UNIQUE_FEATURE_NAME);

    for (JCas jcas : new JCasIterable(reader, segmenter, doc, metaCollector)) {
        System.out.println(jcas.getDocumentText().length());
    }

    int i = 0;
    IndexReader index;
    try {
        index = DirectoryReader.open(FSDirectory.open(tmpDir));
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(LuceneNGram.LUCENE_NGRAM_FIELD + UNIQUE_FEATURE_NAME);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    if (text.utf8ToString().equals("this")) {
                        assertEquals(2, termsEnum.docFreq());
                        assertEquals(3, termsEnum.totalTermFreq());
                    }
                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    assertEquals(35, i);
}