List of usage examples for org.apache.lucene.index DirectoryReader open
public static DirectoryReader open(final Directory directory) throws IOException
public static DirectoryReader open(final IndexCommit commit) throws IOException
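Both overloads return a point-in-time, read-only view of an index. A minimal self-contained sketch of the two call patterns, assuming a hypothetical index path and the Lucene 4.x API used throughout the examples below:

import java.io.File;
import java.util.List;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexCommit;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class DirectoryReaderOpenSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical location of an existing index.
        Directory dir = FSDirectory.open(new File("/path/to/index"));

        // Variant 1: open a reader on the most recent commit.
        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            System.out.println("numDocs: " + reader.numDocs());
        }

        // Variant 2: open a reader on a specific commit point.
        List<IndexCommit> commits = DirectoryReader.listCommits(dir);
        try (DirectoryReader reader = DirectoryReader.open(commits.get(commits.size() - 1))) {
            System.out.println("numDocs at commit: " + reader.numDocs());
        }

        dir.close();
    }
}

Note that Lucene's default deletion policy keeps only the most recent commit, so listCommits usually returns a single element unless a custom IndexDeletionPolicy is in place.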
From source file: de.tudarmstadt.ukp.dkpro.tc.features.ngram.meta.LuceneNGramMetaCollectorTest.java

License: Apache License

@Test
public void luceneNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TextReader.class,
            TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/data/",
            TextReader.PARAM_LANGUAGE, "en",
            TextReader.PARAM_PATTERNS, "text*.txt");

    AnalysisEngineDescription segmenter = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);

    AnalysisEngineDescription metaCollector = AnalysisEngineFactory
            .createEngineDescription(LuceneNGramMetaCollector.class,
                    LuceneNGramDFE.PARAM_LUCENE_DIR, tmpDir);

    for (JCas jcas : new JCasIterable(reader, segmenter, metaCollector)) {
        // System.out.println(jcas.getDocumentText().length());
    }

    int i = 0;
    IndexReader index;
    try {
        index = DirectoryReader.open(FSDirectory.open(tmpDir));
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(LuceneNGramDFE.LUCENE_NGRAM_FIELD);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                // Bits liveDocs = MultiFields.getLiveDocs(index);
                // DocsEnum docs = termsEnum.docs(liveDocs, null);
                // int docId;
                // while ((docId = docs.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
                //     index.g
                // }
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    // System.out.println(text.utf8ToString() + " - " + termsEnum.totalTermFreq());
                    // System.out.println(termsEnum.docFreq());
                    if (text.utf8ToString().equals("this")) {
                        assertEquals(2, termsEnum.docFreq());
                        assertEquals(3, termsEnum.totalTermFreq());
                    }
                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }
    assertEquals(35, i);
}
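The null-guarded term-iteration idiom in this test (MultiFields.getFields, then Terms, then a TermsEnum walk) recurs in most of the examples below. A stripped-down sketch of just that idiom against a small in-memory index, assuming the same Lucene 4.x API and a hypothetical field name "content":

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Version;

public class TermIterationSketch {
    public static void main(String[] args) throws Exception {
        Directory dir = new RAMDirectory();
        IndexWriterConfig cfg = new IndexWriterConfig(Version.LUCENE_44,
                new StandardAnalyzer(Version.LUCENE_44));
        try (IndexWriter writer = new IndexWriter(dir, cfg)) {
            Document doc = new Document();
            doc.add(new TextField("content", "this is the text to be indexed", Store.NO));
            writer.addDocument(doc);
        }

        IndexReader index = DirectoryReader.open(dir);
        Fields fields = MultiFields.getFields(index); // null for an empty index
        if (fields != null) {
            Terms terms = fields.terms("content");    // null if the field does not exist
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text;
                while ((text = termsEnum.next()) != null) {
                    System.out.println(text.utf8ToString()
                            + " docFreq=" + termsEnum.docFreq()
                            + " totalTermFreq=" + termsEnum.totalTermFreq());
                }
            }
        }
        index.close();
        dir.close();
    }
}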
From source file: de.tudarmstadt.ukp.dkpro.tc.features.pair.core.ngram.LuceneNGramCPFE.java

License: Apache License

private FrequencyDistribution<String> getTopNgramsCombo(int topNgramThreshold, String fieldName)
        throws ResourceInitializationException {
    FrequencyDistribution<String> topNGrams = new FrequencyDistribution<String>();

    MinMaxPriorityQueue<TermFreqTuple> topN = MinMaxPriorityQueue.maximumSize(topNgramThreshold).create();

    IndexReader reader;
    try {
        reader = DirectoryReader.open(FSDirectory.open(luceneDir));
        Fields fields = MultiFields.getFields(reader);
        if (fields != null) {
            Terms terms = fields.terms(fieldName);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    String term = text.utf8ToString();
                    long freq = termsEnum.totalTermFreq();
                    // add conditions here, like ngram1 is in most freq ngrams1...
                    String combo1 = term.split(ComboUtils.JOINT)[0];
                    String combo2 = term.split(ComboUtils.JOINT)[1];
                    int combinedSize = combo1.split("_").length + combo2.split("_").length;
                    if (topKSetView1.contains(combo1) && topKSet.contains(combo1)
                            && topKSetView2.contains(combo2) && topKSet.contains(combo2)
                            && combinedSize <= ngramMaxNCombo && combinedSize >= ngramMinNCombo) {
                        // print out here for testing
                        topN.add(new TermFreqTuple(term, freq));
                    }
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    int size = topN.size();
    for (int i = 0; i < size; i++) {
        TermFreqTuple tuple = topN.poll();
        // System.out.println(tuple.getTerm() + " - " + tuple.getFreq());
        topNGrams.addSample(tuple.getTerm(), tuple.getFreq());
    }

    return topNGrams;
}
From source file: de.tudarmstadt.ukp.dkpro.tc.features.pair.core.ngram.LuceneNGramPFE.java

License: Apache License

private FrequencyDistribution<String> getTopNgrams(int topNgramThreshold, String fieldName)
        throws ResourceInitializationException {
    FrequencyDistribution<String> topNGrams = new FrequencyDistribution<String>();

    MinMaxPriorityQueue<TermFreqTuple> topN = MinMaxPriorityQueue.maximumSize(topNgramThreshold).create();

    IndexReader reader;
    try {
        reader = DirectoryReader.open(FSDirectory.open(luceneDir));
        Fields fields = MultiFields.getFields(reader);
        if (fields != null) {
            Terms terms = fields.terms(fieldName);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    String term = text.utf8ToString();
                    long freq = termsEnum.totalTermFreq();
                    topN.add(new TermFreqTuple(term, freq));
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    int size = topN.size();
    for (int i = 0; i < size; i++) {
        TermFreqTuple tuple = topN.poll();
        // System.out.println(tuple.getTerm() + " - " + tuple.getFreq());
        topNGrams.addSample(tuple.getTerm(), tuple.getFreq());
    }

    return topNGrams;
}
From source file: de.tudarmstadt.ukp.dkpro.tc.features.pair.core.ngram.meta.LuceneNGramCPMetaCollectorTest.java

License: Apache License

@Test
public void combinedNgramPairMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TestPairReader.class,
            TestPairReader.PARAM_INPUT_FILE, "src/test/resources/data/textpairs.txt");

    AnalysisEngineDescription segmenter = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);

    AggregateBuilder builder = new AggregateBuilder();
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_TWO);

    AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription(
            LuceneNGramCPMetaCollector.class, LuceneNGramCPFE.PARAM_LUCENE_DIR, tmpDir);

    // test fails if for-loop removed
    for (@SuppressWarnings("unused")
    JCas jcas : new JCasIterable(reader, builder.createAggregateDescription(), metaCollector)) {
        // System.out.println(jcas.getDocumentText().length());
    }

    int i = 0;
    IndexReader index;
    try {
        index = DirectoryReader.open(FSDirectory.open(tmpDir));
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(LuceneNGramCPFE.LUCENE_NGRAM_FIELDCOMBO);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    // System.out.println(text.utf8ToString() + " - " + termsEnum.totalTermFreq());
                    // System.out.println(termsEnum.docFreq());
                    // if there were multiple instances of the same ngram,
                    // then this would be relevant
                    if (text.utf8ToString().equals("mice_ANDcats_.")) {
                        assertEquals(1, termsEnum.docFreq());
                        assertEquals(1, termsEnum.totalTermFreq());
                    }
                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }
    assertEquals(65, i);
}
From source file: de.tudarmstadt.ukp.dkpro.tc.features.pair.core.ngram.meta.LuceneNGramPMetaCollectorTest.java

License: Apache License

@Test
public void lucenePairNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TestPairReader.class,
            TestPairReader.PARAM_INPUT_FILE, "src/test/resources/data/textpairs.txt");

    AnalysisEngineDescription segmenter = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);

    AggregateBuilder builder = new AggregateBuilder();
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_TWO);

    AnalysisEngineDescription metaCollector = AnalysisEngineFactory
            .createEngineDescription(LuceneNGramPMetaCollector.class,
                    LuceneNGramPFE.PARAM_LUCENE_DIR, tmpDir);

    // test fails if for-loop removed
    for (@SuppressWarnings("unused")
    JCas jcas : new JCasIterable(reader, builder.createAggregateDescription(), metaCollector)) {
        // System.out.println(jcas.getDocumentText().length());
    }

    int i = 0;
    IndexReader index;
    try {
        index = DirectoryReader.open(FSDirectory.open(tmpDir));
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(LuceneNGramDFE.LUCENE_NGRAM_FIELD);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    // System.out.println(text.utf8ToString() + " - " + termsEnum.totalTermFreq());
                    // System.out.println(termsEnum.docFreq());
                    if (text.utf8ToString().equals("this")) {
                        assertEquals(2, termsEnum.docFreq());
                        assertEquals(3, termsEnum.totalTermFreq());
                    }
                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }
    assertEquals(16, i);
}
From source file: de.tudarmstadt.ukp.experiments.argumentation.clustering.debatefiltering.LuceneSearcher.java

License: Apache License

public List<String> retrieveTopNDocs(String textQuery, int topN) throws Exception {
    // Now search the index:
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
    Directory directory = FSDirectory.open(luceneIndexDir);
    IndexReader reader = DirectoryReader.open(directory);
    IndexSearcher indexSearcher = new IndexSearcher(reader);

    // Parse a simple query
    QueryParser parser = new QueryParser(Version.LUCENE_44, LuceneIndexer.FIELD_TEXT_CONTENT, analyzer);
    Query query = parser.parse(textQuery);
    ScoreDoc[] hits = indexSearcher.search(query, null, topN).scoreDocs;

    List<String> result = new ArrayList<>();

    // Iterate through the results:
    for (int i = 0; i < hits.length; i++) {
        Document hitDoc = indexSearcher.doc(hits[i].doc);
        result.add(hitDoc.getField(LuceneIndexer.FIELD_FILE).stringValue());
        // System.out.println(hitDoc.toString());
        // assertEquals("This is the text to be indexed.", hitDoc.get("fieldname"));
    }

    reader.close();
    directory.close();

    return result;
}
From source file: de.twitterlivesearch.analysis.Searcher.java

License: Apache License

/**
 * This is the same as
 * {@link de.twitterlivesearch.analysis.Searcher#searchForTweets(String)
 * searchForTweets(String)}, but the search is limited to the tweet with the
 * given id. This can for example be used to analyze the latest incoming
 * tweet.
 *
 * @param id
 * @param queryString
 * @return
 */
public List<Document> searchForTweets(Integer id, String queryString) {
    if (queryString.isEmpty()) {
        return Collections.emptyList();
    }

    AbstractConfiguration config = ConfigurationHolder.getConfiguration();
    try {
        if (!DirectoryReader.indexExists(directory)) {
            return null;
        }
    } catch (IOException e) {
        log.fatal("Error when trying to check if directory exists!", e);
        return new ArrayList<>();
    }

    DirectoryReader ireader;
    try {
        ireader = DirectoryReader.open(directory);
    } catch (IOException e) {
        log.fatal("Error when trying to open directory!", e);
        return null;
    }

    IndexSearcher isearcher = new IndexSearcher(ireader);
    Query textQuery = null;
    QueryParser parser = new QueryParser(FieldNames.TEXT.getField(),
            AnalyzerMapping.getInstance().ANALYZER_FOR_DELIMITER);
    parser.setDefaultOperator(config.getDefaultOperator());
    BooleanQuery query = new BooleanQuery();
    try {
        textQuery = parser.parse(queryString);
    } catch (ParseException e) {
        log.fatal("Error while parsing query: " + queryString, e);
    }

    // if id does not equal null only the query with the given id will be
    // searched
    // this can be used to search the latest element only
    if (id != null) {
        Query idQuery = NumericRangeQuery.newIntRange(FieldNames.ID.getField(), id.intValue(),
                id.intValue(), true, true);
        query.add(idQuery, Occur.MUST);
    }
    query.add(textQuery, Occur.MUST);

    ScoreDoc[] hits = null;
    try {
        hits = isearcher.search(query, 1000).scoreDocs;
    } catch (IOException e) {
        log.fatal("Error while trying to search!", e);
    }

    List<Document> result = new ArrayList<>();
    for (int i = 0; i < hits.length; i++) {
        try {
            result.add(isearcher.doc(hits[i].doc));
            log.info("Found result for query \"" + queryString + "\".");
        } catch (IOException e) {
            log.fatal("Error when getting document!", e);
        }
    }
    return result;
}
From source file: de.unihildesheim.iw.cli.DumpIPCs.java

License: Open Source License

private void runMain(final String... args) throws IOException, BuildException {
    new CmdLineParser(this.cliParams);
    parseWithHelp(this.cliParams, args);

    // check, if files and directories are sane
    this.cliParams.check();
    assert this.cliParams.idxReader != null;

    final int maxDoc = this.cliParams.idxReader.maxDoc();
    if (maxDoc == 0) {
        LOG.error("Empty index.");
        return;
    }

    final Parser ipcParser = new Parser();
    ipcParser.separatorChar(this.cliParams.sep);
    ipcParser.allowZeroPad(this.cliParams.zeroPad);

    final DirectoryReader reader = DirectoryReader.open(FSDirectory.open(this.cliParams.idxDir.toPath()));
    final Builder idxReaderBuilder = new Builder(reader);

    Pattern rx_ipc = null;

    if (this.cliParams.ipc != null) {
        final IPCRecord ipc = ipcParser.parse(this.cliParams.ipc);
        final BooleanQuery bq = new BooleanQuery();
        rx_ipc = Pattern.compile(ipc.toRegExpString(this.cliParams.sep));
        if (LOG.isDebugEnabled()) {
            LOG.debug("IPC regExp: rx={} pat={}", ipc.toRegExpString(this.cliParams.sep), rx_ipc);
        }
        bq.add(new QueryWrapperFilter(IPCClassQuery.get(ipc, this.cliParams.sep)), Occur.MUST);
        bq.add(new QueryWrapperFilter(
                new IPCFieldFilter(new IPCFieldFilterFunctions.SloppyMatch(ipc), ipcParser)), Occur.MUST);
        idxReaderBuilder.queryFilter(new QueryWrapperFilter(bq));
    }

    final IndexReader idxReader = idxReaderBuilder.build();
    if (idxReader.numDocs() > 0) {
        final Terms terms = MultiFields.getTerms(idxReader, LUCENE_CONF.FLD_IPC);
        TermsEnum termsEnum = TermsEnum.EMPTY;
        BytesRef term;
        if (terms != null) {
            termsEnum = terms.iterator(termsEnum);
            term = termsEnum.next();
            final int[] count = { 0, 0 }; // match, exclude
            while (term != null) {
                final String code = term.utf8ToString();
                if (rx_ipc == null || (rx_ipc.matcher(code).matches())) {
                    final IPCRecord record = ipcParser.parse(code);
                    try {
                        System.out.println(code + ' ' + record
                                + " (" + record.toFormattedString() + ") "
                                + '[' + record.toRegExpString('-') + ']');
                    } catch (final IllegalArgumentException e) {
                        System.out.println(code + ' ' + "INVALID (" + code + ')');
                    }
                    count[0]++;
                } else {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Skip non matching IPC: {}", code);
                    }
                    count[1]++;
                }
                term = termsEnum.next();
            }
            LOG.info("match={} skip={}", count[0], count[1]);
        }
    } else {
        LOG.info("No documents left after filtering.");
    }
}
From source file: de.unihildesheim.iw.lucene.index.FDRIndexDataProviderTest.java

License: Open Source License

@Test
public void testBuilder_tvIndex() throws Exception {
    try (TestMemIndex idx = new TestMemIndex(Index.TVECTORS)) {
        final DirectoryReader reader = DirectoryReader.open(idx.dir);
        final FilteredDirectoryReader idxReader = new FilteredDirectoryReader.Builder(reader).build();
        new FDRIndexDataProvider.Builder().indexReader(idxReader).build();
    }
}
From source file: de.unihildesheim.iw.lucene.index.FDRIndexDataProviderTest.java

License: Open Source License

@Test
public void testBuilder_noTvIndex() throws Exception {
    try (TestMemIndex idx = new TestMemIndex(Index.NO_TVECTORS)) {
        final DirectoryReader reader = DirectoryReader.open(idx.dir);
        final FilteredDirectoryReader idxReader = new FilteredDirectoryReader.Builder(reader).build();
        try {
            new FDRIndexDataProvider.Builder().indexReader(idxReader).build();
            Assert.fail("Expected an IllegalStateException to be thrown.");
        } catch (final IllegalStateException e) {
            // pass
        }
    }
}