List of usage examples for org.apache.lucene.index.Fields#terms(String)
public abstract Terms terms(String field) throws IOException;
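All of the examples below follow the same basic pattern: obtain a Fields instance (typically via MultiFields.getFields on a Lucene 4.x IndexReader, or from term vectors or a leaf reader), look up the Terms for one field, and walk its TermsEnum. A minimal sketch of that pattern, assuming an already-open IndexReader and an indexed field named "content" (the field name is illustrative):

import java.io.IOException;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

static void dumpTerms(IndexReader reader) throws IOException {
    Fields fields = MultiFields.getFields(reader); // null for an index with no fields
    if (fields == null) {
        return;
    }
    Terms terms = fields.terms("content"); // null if the field does not exist
    if (terms == null) {
        return;
    }
    TermsEnum termsEnum = terms.iterator(null); // pass null, or a previous enum to reuse
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
        System.out.println(term.utf8ToString() + " docFreq=" + termsEnum.docFreq());
    }
}

Note the two null checks: both Fields and Terms lookups may return null, and the examples below guard for this in the same way.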
From source file:de.tudarmstadt.ukp.dkpro.tc.features.pair.core.ngram.meta.LuceneNGramCPMetaCollectorTest.java
License:Apache License
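Iterates all terms of the combined n-gram field written by a LuceneNGramCPMetaCollector, checking docFreq and totalTermFreq for one known n-gram and counting the total number of terms.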
@Test
public void combinedNgramPairMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(
            TestPairReader.class, TestPairReader.PARAM_INPUT_FILE,
            "src/test/resources/data/textpairs.txt");

    AnalysisEngineDescription segmenter = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);

    AggregateBuilder builder = new AggregateBuilder();
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_TWO);

    AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription(
            LuceneNGramCPMetaCollector.class, LuceneNGramCPFE.PARAM_LUCENE_DIR, tmpDir);

    // test fails if for-loop removed
    for (@SuppressWarnings("unused")
    JCas jcas : new JCasIterable(reader, builder.createAggregateDescription(), metaCollector)) {
        // System.out.println(jcas.getDocumentText().length());
    }

    int i = 0;
    IndexReader index;
    try {
        index = DirectoryReader.open(FSDirectory.open(tmpDir));
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(LuceneNGramCPFE.LUCENE_NGRAM_FIELDCOMBO);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    // System.out.println(text.utf8ToString() + " - " + termsEnum.totalTermFreq());
                    // System.out.println(termsEnum.docFreq());

                    // if there were multiple instances of the same ngram,
                    // then this would be relevant
                    if (text.utf8ToString().equals("mice_ANDcats_.")) {
                        assertEquals(1, termsEnum.docFreq());
                        assertEquals(1, termsEnum.totalTermFreq());
                    }
                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }
    assertEquals(65, i);
}
From source file:de.tudarmstadt.ukp.dkpro.tc.features.pair.core.ngram.meta.LuceneNGramPMetaCollectorTest.java
License:Apache License
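The same setup for the pair n-gram collector; verifies that the term "this" occurs in 2 documents with a total frequency of 3, and that 16 terms were indexed in all.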
@Test
public void lucenePairNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(
            TestPairReader.class, TestPairReader.PARAM_INPUT_FILE,
            "src/test/resources/data/textpairs.txt");

    AnalysisEngineDescription segmenter = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);

    AggregateBuilder builder = new AggregateBuilder();
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_TWO);

    AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription(
            LuceneNGramPMetaCollector.class, LuceneNGramPFE.PARAM_LUCENE_DIR, tmpDir);

    // test fails if for-loop removed
    for (@SuppressWarnings("unused")
    JCas jcas : new JCasIterable(reader, builder.createAggregateDescription(), metaCollector)) {
        // System.out.println(jcas.getDocumentText().length());
    }

    int i = 0;
    IndexReader index;
    try {
        index = DirectoryReader.open(FSDirectory.open(tmpDir));
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(LuceneNGramDFE.LUCENE_NGRAM_FIELD);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    // System.out.println(text.utf8ToString() + " - " + termsEnum.totalTermFreq());
                    // System.out.println(termsEnum.docFreq());
                    if (text.utf8ToString().equals("this")) {
                        assertEquals(2, termsEnum.docFreq());
                        assertEquals(3, termsEnum.totalTermFreq());
                    }
                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }
    assertEquals(16, i);
}
From source file:de.unihildesheim.iw.lucene.index.FDRIndexDataProvider.java
License:Open Source License
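Collects the distinct terms of a document's term vectors, restricted to the requested fields, into a BytesRefHash and returns them as a stream.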
@SuppressFBWarnings({ "EXS_EXCEPTION_SOFTENING_NO_CONSTRAINTS", "EXS_EXCEPTION_SOFTENING_NO_CHECKED" })
@Override
public Stream<BytesRef> getDocumentTerms(final int docId, @NotNull final String... field) {
    Arrays.sort(field);
    final Fields fields;
    try {
        fields = this.index.reader.getTermVectors(docId);
    } catch (final IOException e) {
        throw new UncheckedIOException(e);
    }
    if (fields == null) {
        return Stream.empty();
    }
    final BytesRefHash terms = new BytesRefHash();
    StreamSupport.stream(fields.spliterator(), false)
            // filter for required fields
            .filter(fn -> Arrays.binarySearch(field, fn) >= 0)
            .map(fn -> {
                try {
                    return fields.terms(fn);
                } catch (final IOException e) {
                    throw new UncheckedIOException(e);
                }
            })
            .filter(t -> t != null)
            .forEach(t -> {
                try {
                    final TermsEnum te = t.iterator(null);
                    BytesRef term;
                    while ((term = te.next()) != null) {
                        terms.add(term);
                    }
                } catch (final IOException e) {
                    throw new UncheckedIOException(e);
                }
            });
    return StreamUtils.stream(terms);
}
From source file:de.unihildesheim.iw.lucene.index.FilteredDirectoryReaderTest.java
License:Open Source License
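Walks every field exposed by a FilteredDirectoryReader's subreaders and exhausts each TermsEnum, calling totalTermFreq() on every term.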
@Test
public void testTermsEnum_totalTermFreq() throws Exception {
    try (TestMemIndex idx = new TestMemIndex(Index.ALL_FIELDS)) {
        final DirectoryReader reader = DirectoryReader.open(idx.dir);
        final FilteredDirectoryReader fReader =
                new Builder(reader).fields(Collections.singleton("f2")).build();

        fReader.getSubReaders().forEach(r -> {
            final Fields f = r.fields();
            f.forEach(fld -> {
                try {
                    final Terms t = f.terms(fld);
                    final TermsEnum te = t.iterator(null);
                    while (te.next() != null) {
                        te.totalTermFreq();
                    }
                } catch (final IOException e) {
                    throw new UncheckedIOException(e);
                }
            });
        });
    }
}
From source file:de.unihildesheim.iw.lucene.query.RelaxableCommonTermsQuery.java
License:Open Source License
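Builds a CommonTermsQuery-style query: for each field, it looks up every unique query term's TermContext across all index leaves (via Fields.terms and TermsEnum.seekExact) and sorts the terms into low- and high-frequency boolean clauses.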
/**
 * New instance using settings from the supplied {@link Builder} instance.
 *
 * @param builder {@link Builder} Instance builder
 * @throws IOException Thrown on low-level i/o-errors
 */
@SuppressWarnings({ "ObjectAllocationInLoop", "ObjectEquality" })
RelaxableCommonTermsQuery(@NotNull final Builder builder) throws IOException {
    // get all query terms
    assert builder.queryStr != null;
    assert builder.analyzer != null;
    this.queryTerms = QueryUtils.tokenizeQueryString(builder.queryStr, builder.analyzer);

    // list of unique terms contained in the query (stopped, analyzed)
    final String[] uniqueQueryTerms = this.queryTerms.stream().distinct().toArray(String[]::new);
    final int uniqueTermsCount = uniqueQueryTerms.length;

    // heavily based on code from org.apache.lucene.queries.CommonTermsQuery
    assert builder.reader != null;
    final List<LeafReaderContext> leaves = builder.reader.leaves();
    final int maxDoc = builder.reader.maxDoc();
    TermsEnum termsEnum = null;
    final List<Query> subQueries = new ArrayList<>(10);

    assert builder.fields != null;
    for (final String field : builder.fields) {
        final TermContext[] tcArray = new TermContext[uniqueTermsCount];
        final BooleanQuery lowFreq = new BooleanQuery();
        final BooleanQuery highFreq = new BooleanQuery();

        // collect term statistics
        for (int i = 0; i < uniqueTermsCount; i++) {
            final Term term = new Term(field, uniqueQueryTerms[i]);
            for (final LeafReaderContext context : leaves) {
                final TermContext termContext = tcArray[i];
                final Fields fields = context.reader().fields();
                final Terms terms = fields.terms(field);
                if (terms != null) { // only, if field exists
                    termsEnum = terms.iterator(termsEnum);
                    if (termsEnum != TermsEnum.EMPTY) {
                        if (termsEnum.seekExact(term.bytes())) {
                            if (termContext == null) {
                                tcArray[i] = new TermContext(builder.reader.getContext(),
                                        termsEnum.termState(), context.ord,
                                        termsEnum.docFreq(), termsEnum.totalTermFreq());
                            } else {
                                termContext.register(termsEnum.termState(), context.ord,
                                        termsEnum.docFreq(), termsEnum.totalTermFreq());
                            }
                        }
                    }
                }
            }

            // build query
            if (tcArray[i] == null) {
                lowFreq.add(new TermQuery(term), builder.lowFreqOccur);
            } else {
                if ((builder.maxTermFrequency >= 1f
                        && (float) tcArray[i].docFreq() > builder.maxTermFrequency)
                        || (tcArray[i].docFreq() > (int) Math
                                .ceil((double) (builder.maxTermFrequency * (float) maxDoc)))) {
                    highFreq.add(new TermQuery(term, tcArray[i]), builder.highFreqOccur);
                } else {
                    lowFreq.add(new TermQuery(term, tcArray[i]), builder.lowFreqOccur);
                }
            }

            final int numLowFreqClauses = lowFreq.clauses().size();
            final int numHighFreqClauses = highFreq.clauses().size();
            if (builder.lowFreqOccur == Occur.SHOULD && numLowFreqClauses > 0) {
                lowFreq.setMinimumNumberShouldMatch(numLowFreqClauses);
            }
            if (builder.highFreqOccur == Occur.SHOULD && numHighFreqClauses > 0) {
                highFreq.setMinimumNumberShouldMatch(numHighFreqClauses);
            }
        }

        if (LOG.isDebugEnabled()) {
            LOG.debug("qLF={}", lowFreq);
            LOG.debug("qHF={}", highFreq);
        }

        if (lowFreq.clauses().isEmpty()) {
            subQueries.add(highFreq);
        } else if (highFreq.clauses().isEmpty()) {
            subQueries.add(lowFreq);
        } else {
            final BooleanQuery query = new BooleanQuery(true); // final query
            query.add(highFreq, Occur.SHOULD);
            query.add(lowFreq, Occur.MUST);
            subQueries.add(query);
        }
    }

    if (LOG.isDebugEnabled()) {
        LOG.debug("qList={}", subQueries);
    }
    this.query = subQueries.size() == 1
            ? subQueries.get(0)
            : new DisjunctionMaxQuery(subQueries, 0.1f);
    if (LOG.isDebugEnabled()) {
        LOG.debug("RCTQ {} uQt={}", this.query, uniqueQueryTerms);
    }
}
From source file:edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.corpus.lucene.CorpusHighFreqTerms.java
License:Open Source License
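Returns the numTerms terms with the highest document frequency, either for a single named field or, if field is null, across all fields of the index.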
/**
 * @param reader
 * @param numTerms
 * @param field
 * @return TermStats[] ordered by terms with highest docFreq first.
 * @throws Exception
 */
public static TermStats[] getHighFreqTerms(IndexReader reader, int numTerms, String field)
        throws Exception {
    TermStatsQueue tiq = null;

    if (field != null) {
        Fields fields = MultiFields.getFields(reader);
        if (fields == null) {
            throw new RuntimeException("field " + field + " not found");
        }
        Terms terms = fields.terms(field);
        if (terms != null) {
            TermsEnum termsEnum = terms.iterator(null);
            tiq = new TermStatsQueue(numTerms);
            tiq.fill(field, termsEnum);
        }
    } else {
        Fields fields = MultiFields.getFields(reader);
        if (fields == null) {
            throw new RuntimeException("no fields found for this index");
        }
        tiq = new TermStatsQueue(numTerms);
        FieldsEnum fieldsEnum = fields.iterator();
        while (true) {
            field = fieldsEnum.next();
            if (field != null) {
                Terms terms = fieldsEnum.terms();
                if (terms != null) {
                    tiq.fill(field, terms.iterator(null));
                }
            } else {
                break;
            }
        }
    }

    TermStats[] result = new TermStats[tiq.size()];
    // we want highest first so we read the queue and populate the array
    // starting at the end and work backwards
    int count = tiq.size() - 1;
    while (tiq.size() != 0) {
        result[count] = tiq.pop();
        count--;
    }
    return result;
}
From source file:Example.lucene.ReadIndex.java
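Opens an existing index and prints every term of the "content" field together with its document frequency.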
public static void main(String[] args) throws IOException, ParseException {
    Analyzer analyzer = new ThaiAnalyzer(Version.LUCENE_45);
    Directory index = FSDirectory.open(new File("data/indexing"));

    // 2. query
    //String querystr = args.length > 0 ? args[0] : "golf user";

    // the "title" arg specifies the default field to use
    // when no field is explicitly specified in the query.
    //Query q = new MultiFieldQueryParser(Version.LUCENE_45, new String[] {"content"}, analyzer).parse(querystr);

    //IndexReader indexReader = IndexReader.open(path);
    IndexReader reader = DirectoryReader.open(index);
    //IndexSearcher searcher = new IndexSearcher(reader);
    //Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms("content");
    //TermsEnum te = terms.iterator(TermsEnum.EMPTY);

    Fields fields = MultiFields.getFields(reader);
    Terms terms = fields.terms("content");
    TermsEnum iterator = terms.iterator(null);
    BytesRef byteRef;
    while ((byteRef = iterator.next()) != null) {
        String term = new String(byteRef.bytes, byteRef.offset, byteRef.length);
        int docFreq = iterator.docFreq();
        System.out.println(term + " " + docFreq);
    }
}
From source file:in.geocoder.component.GeocodingComponent.java
License:Apache License
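Solr component helper that enumerates all terms of a field and returns them with their document frequencies, optionally sorted by frequency.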
private NamedList<Integer> getTerms(SolrIndexSearcher searcher, IndexSchema schema, String field)
        throws IOException {
    NamedList<Object> termsResult = new SimpleOrderedMap<Object>();
    boolean sort = true;
    boolean raw = false;
    final AtomicReader indexReader = searcher.getAtomicReader();
    Fields lfields = indexReader.fields();

    NamedList<Integer> fieldTerms = new NamedList<Integer>();
    termsResult.add(field, fieldTerms);

    Terms terms = lfields == null ? null : lfields.terms(field);
    if (terms == null) {
        // no terms for this field
        return new NamedList<Integer>();
    }

    FieldType ft = raw ? null : schema.getFieldTypeNoEx(field);
    if (ft == null)
        ft = new StrField();

    TermsEnum termsEnum = terms.iterator(null);
    BytesRef term = null;
    term = termsEnum.next();

    BoundedTreeSet<CountPair<BytesRef, Integer>> queue = (sort
            ? new BoundedTreeSet<CountPair<BytesRef, Integer>>(Integer.MAX_VALUE)
            : null);
    CharsRef external = new CharsRef();

    while (term != null) {
        boolean externalized = false; // did we fill in "external" yet for this term?
        // This is a good term in the range. Check if mincount/maxcount conditions are satisfied.
        int docFreq = termsEnum.docFreq();
        // add the term to the list
        if (sort) {
            queue.add(new CountPair<BytesRef, Integer>(BytesRef.deepCopyOf(term), docFreq));
        } else {
            // TODO: handle raw somehow
            if (!externalized) {
                ft.indexedToReadable(term, external);
            }
            fieldTerms.add(external.toString(), docFreq);
        }
        term = termsEnum.next();
    }

    if (sort) {
        for (CountPair<BytesRef, Integer> item : queue) {
            ft.indexedToReadable(item.key, external);
            fieldTerms.add(external.toString(), item.val);
        }
    }

    return fieldTerms;
}
From source file:indexer.DocVecSequenceFileGenerator.java
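Counts the documents in the index by taking Terms.size() on the document-name field, i.e. the number of unique terms in that field.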
long getNumDocs() throws Exception {
    Fields fields = MultiFields.getFields(reader);
    return fields.terms(AMI_FIELDS.FIELD_DOC_NAME).size();
}
From source file:indexer.OptimizedRealValuedVecIndexer.java
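Iterates all cell-id terms and, for each cell that needs splitting, requantizes the vectors it contains, rewriting the affected documents in the index.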
void processAllTermWise() throws Exception {
    Cell cell, requantizedCell;
    copyIndex();
    IndexReader currentReader = DirectoryReader.open(writer, true);
    List<Cell> splitCells = new ArrayList<>();

    Fields fields = MultiFields.getFields(reader);
    Terms terms = fields.terms(DocVector.FIELD_CELL_ID);
    TermsEnum te = terms.iterator();

    // Iterate through every term (a cell docName) and requantize the
    // points within the cell if required.
    while (te.next() != null) {
        String cellId = te.term().utf8ToString();
        cell = new Cell(cellId);
        if (cell.toSplit(reader)) {
            splitCells.add(cell);
            List<DocVector> containedPoints = cell.getVectors(currentReader, terms, numDimensions);
            for (DocVector p : containedPoints) {
                requantizedCell = cell.quantize(p); // this function returns a new object
                p.quantize(requantizedCell); // update quantization info (cell docName)
                Document doc = p.constructDoc();
                Term t = new Term(DocVector.FIELD_ID, p.docName);
                writer.deleteDocuments(t);
                writer.addDocument(doc);
            }
            // Much faster if we don't commit here...
            // writer.commit();
        }
    }
    saveSplitCells(writer, splitCells);
    currentReader.close();
    reader.close();
    writer.close();
}