Example usage for org.apache.lucene.index DirectoryReader open

Introduction

On this page you can find usage examples for org.apache.lucene.index DirectoryReader open, collected from open-source projects.

Prototype

public static DirectoryReader open(final Directory directory) throws IOException

Document

Returns a DirectoryReader reading the index in the given Directory. This is the overload called by every example on this page; related overloads accept an IndexCommit (to read the index as of a specific commit point) or an IndexWriter (for a near-real-time reader).
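
For orientation, here is a minimal, self-contained sketch of the call itself before the project-specific examples below. It assumes a Lucene 4.x index (matching the FSDirectory.open(File) signature used throughout this page) already exists at the hypothetical path shown.

import java.io.File;
import java.io.IOException;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class OpenReaderExample {
    public static void main(String[] args) throws IOException {
        // Hypothetical index location; point this at an existing Lucene index.
        File indexDir = new File("/path/to/index");
        // Both Directory and DirectoryReader are Closeable, so
        // try-with-resources releases the underlying file handles.
        try (Directory directory = FSDirectory.open(indexDir);
                DirectoryReader reader = DirectoryReader.open(directory)) {
            System.out.println("Documents in index: " + reader.numDocs());
        }
    }
}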

Usage

From source file:de.tudarmstadt.ukp.dkpro.tc.features.ngram.meta.LuceneNGramMetaCollectorTest.java

License: Apache License

@Test
public void luceneNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TextReader.class,
            TextReader.PARAM_SOURCE_LOCATION, "src/test/resources/data/", TextReader.PARAM_LANGUAGE, "en",
            TextReader.PARAM_PATTERNS, "text*.txt");

    AnalysisEngineDescription segmenter = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);

    AnalysisEngineDescription metaCollector = AnalysisEngineFactory
            .createEngineDescription(LuceneNGramMetaCollector.class, LuceneNGramDFE.PARAM_LUCENE_DIR, tmpDir);

    for (JCas jcas : new JCasIterable(reader, segmenter, metaCollector)) {
        // System.out.println(jcas.getDocumentText().length());
    }

    int i = 0;
    IndexReader index;
    try {
        index = DirectoryReader.open(FSDirectory.open(tmpDir));
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(LuceneNGramDFE.LUCENE_NGRAM_FIELD);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                // Bits liveDocs = MultiFields.getLiveDocs(index);
                // DocsEnum docs = termsEnum.docs(liveDocs, null);
                // int docId;
                // while ((docId = docs.nextDoc()) != DocsEnum.NO_MORE_DOCS) {
                //     index.g
                // }
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    // System.out.println(text.utf8ToString() + " - " + termsEnum.totalTermFreq());
                    // System.out.println(termsEnum.docFreq());

                    if (text.utf8ToString().equals("this")) {
                        assertEquals(2, termsEnum.docFreq());
                        assertEquals(3, termsEnum.totalTermFreq());
                    }

                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    assertEquals(35, i);
}
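
The test above follows the pattern that recurs throughout this page: open the index directory, fetch the Fields via MultiFields, and walk a TermsEnum over one field. Stripped of the test harness and assertions, a self-contained sketch of that pattern looks as follows; the field name and index path are assumptions, not taken from the DKPro code.

import java.io.File;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class TermIterationSketch {
    public static void main(String[] args) throws Exception {
        IndexReader index = DirectoryReader.open(FSDirectory.open(new File("/path/to/index")));
        try {
            Fields fields = MultiFields.getFields(index); // null for an index without fields
            if (fields != null) {
                Terms terms = fields.terms("ngram"); // assumed field name
                if (terms != null) {
                    TermsEnum termsEnum = terms.iterator(null);
                    BytesRef text;
                    while ((text = termsEnum.next()) != null) {
                        System.out.println(text.utf8ToString() + " docFreq=" + termsEnum.docFreq()
                                + " totalTermFreq=" + termsEnum.totalTermFreq());
                    }
                }
            }
        } finally {
            index.close(); // the test above skips this; long-lived code should not
        }
    }
}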

From source file:de.tudarmstadt.ukp.dkpro.tc.features.pair.core.ngram.LuceneNGramCPFE.java

License: Apache License

private FrequencyDistribution<String> getTopNgramsCombo(int topNgramThreshold, String fieldName)
        throws ResourceInitializationException {

    FrequencyDistribution<String> topNGrams = new FrequencyDistribution<String>();

    MinMaxPriorityQueue<TermFreqTuple> topN = MinMaxPriorityQueue.maximumSize(topNgramThreshold).create();
    IndexReader reader;
    try {
        reader = DirectoryReader.open(FSDirectory.open(luceneDir));
        Fields fields = MultiFields.getFields(reader);
        if (fields != null) {
            Terms terms = fields.terms(fieldName);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    String term = text.utf8ToString();
                    long freq = termsEnum.totalTermFreq();
                    //add conditions here, like ngram1 is in most freq ngrams1...
                    String combo1 = term.split(ComboUtils.JOINT)[0];
                    String combo2 = term.split(ComboUtils.JOINT)[1];
                    int combinedSize = combo1.split("_").length + combo2.split("_").length;
                    if (topKSetView1.contains(combo1) && topKSet.contains(combo1)
                            && topKSetView2.contains(combo2) && topKSet.contains(combo2)
                            && combinedSize <= ngramMaxNCombo && combinedSize >= ngramMinNCombo) {
                        //print out here for testing
                        topN.add(new TermFreqTuple(term, freq));
                    }
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    int size = topN.size();
    for (int i = 0; i < size; i++) {
        TermFreqTuple tuple = topN.poll();
        // System.out.println(tuple.getTerm() + " - " + tuple.getFreq());
        topNGrams.addSample(tuple.getTerm(), tuple.getFreq());
    }

    return topNGrams;
}
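
Both pair feature extractors keep only the most frequent ngrams by streaming every term through a bounded Guava MinMaxPriorityQueue instead of sorting the whole vocabulary. Below is a standalone sketch of that bounded top-N idiom; the TermFreq class and its descending-frequency ordering are stand-ins for DKPro's TermFreqTuple, whose actual compareTo is not shown in the excerpt.

import java.util.Comparator;

import com.google.common.collect.MinMaxPriorityQueue;

public class TopNSketch {
    // Stand-in for TermFreqTuple (hypothetical, for illustration only).
    static class TermFreq {
        final String term;
        final long freq;
        TermFreq(String term, long freq) { this.term = term; this.freq = freq; }
    }

    public static void main(String[] args) {
        // Order by descending frequency, so the queue's "greatest" element is
        // the least frequent one; once the queue holds maximumSize elements,
        // that is what each further add silently evicts.
        MinMaxPriorityQueue<TermFreq> topN = MinMaxPriorityQueue
                .orderedBy(Comparator.comparingLong((TermFreq t) -> t.freq).reversed())
                .maximumSize(3)
                .create();

        topN.add(new TermFreq("the_cat", 5));
        topN.add(new TermFreq("a_dog", 9));
        topN.add(new TermFreq("some_fish", 1)); // evicted once "the_dog" arrives
        topN.add(new TermFreq("the_dog", 7));

        // poll() removes the least element of the ordering, i.e. the most
        // frequent remaining tuple: a_dog, the_dog, the_cat.
        TermFreq t;
        while ((t = topN.poll()) != null) {
            System.out.println(t.term + " - " + t.freq);
        }
    }
}

Assuming TermFreqTuple likewise orders tuples from most to least frequent, the poll() loop at the end of the method above drains the queue most-frequent-first.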

From source file:de.tudarmstadt.ukp.dkpro.tc.features.pair.core.ngram.LuceneNGramPFE.java

License: Apache License

private FrequencyDistribution<String> getTopNgrams(int topNgramThreshold, String fieldName)
        throws ResourceInitializationException {

    FrequencyDistribution<String> topNGrams = new FrequencyDistribution<String>();

    MinMaxPriorityQueue<TermFreqTuple> topN = MinMaxPriorityQueue.maximumSize(topNgramThreshold).create();
    IndexReader reader;
    try {
        reader = DirectoryReader.open(FSDirectory.open(luceneDir));
        Fields fields = MultiFields.getFields(reader);
        if (fields != null) {
            Terms terms = fields.terms(fieldName);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    String term = text.utf8ToString();
                    long freq = termsEnum.totalTermFreq();
                    topN.add(new TermFreqTuple(term, freq));
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    int size = topN.size();
    for (int i = 0; i < size; i++) {
        TermFreqTuple tuple = topN.poll();
        // System.out.println(tuple.getTerm() + " - " + tuple.getFreq());
        topNGrams.addSample(tuple.getTerm(), tuple.getFreq());
    }

    return topNGrams;
}

From source file:de.tudarmstadt.ukp.dkpro.tc.features.pair.core.ngram.meta.LuceneNGramCPMetaCollectorTest.java

License: Apache License

@Test
public void combinedNgramPairMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TestPairReader.class,
            TestPairReader.PARAM_INPUT_FILE, "src/test/resources/data/textpairs.txt");

    AnalysisEngineDescription segmenter = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);

    AggregateBuilder builder = new AggregateBuilder();
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_TWO);

    AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription(
            LuceneNGramCPMetaCollector.class, LuceneNGramCPFE.PARAM_LUCENE_DIR, tmpDir);

    // test fails if for-loop removed
    for (@SuppressWarnings("unused")
    JCas jcas : new JCasIterable(reader, builder.createAggregateDescription(), metaCollector)) {
        // System.out.println(jcas.getDocumentText().length());
    }

    int i = 0;
    IndexReader index;
    try {
        index = DirectoryReader.open(FSDirectory.open(tmpDir));
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(LuceneNGramCPFE.LUCENE_NGRAM_FIELDCOMBO);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);

                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    // System.out.println(text.utf8ToString() + " - " +
                    // termsEnum.totalTermFreq());
                    // System.out.println(termsEnum.docFreq());

                    // if there were multiple instances of the same ngram,
                    // then this would be relevant
                    if (text.utf8ToString().equals("mice_ANDcats_.")) {
                        assertEquals(1, termsEnum.docFreq());
                        assertEquals(1, termsEnum.totalTermFreq());
                    }
                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    assertEquals(65, i);
}

From source file:de.tudarmstadt.ukp.dkpro.tc.features.pair.core.ngram.meta.LuceneNGramPMetaCollectorTest.java

License: Apache License

@Test
public void lucenePairNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TestPairReader.class,
            TestPairReader.PARAM_INPUT_FILE, "src/test/resources/data/textpairs.txt");

    AnalysisEngineDescription segmenter = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);

    AggregateBuilder builder = new AggregateBuilder();
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_TWO);

    AnalysisEngineDescription metaCollector = AnalysisEngineFactory
            .createEngineDescription(LuceneNGramPMetaCollector.class, LuceneNGramPFE.PARAM_LUCENE_DIR, tmpDir);

    // test fails if for-loop removed
    for (@SuppressWarnings("unused")
    JCas jcas : new JCasIterable(reader, builder.createAggregateDescription(), metaCollector)) {
        // System.out.println(jcas.getDocumentText().length());
    }

    int i = 0;
    IndexReader index;
    try {
        index = DirectoryReader.open(FSDirectory.open(tmpDir));
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(LuceneNGramDFE.LUCENE_NGRAM_FIELD);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);

                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    // System.out.println(text.utf8ToString() + " - " +
                    // termsEnum.totalTermFreq());
                    // System.out.println(termsEnum.docFreq());

                    if (text.utf8ToString().equals("this")) {
                        assertEquals(2, termsEnum.docFreq());
                        assertEquals(3, termsEnum.totalTermFreq());
                    }

                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    assertEquals(16, i);
}

From source file:de.tudarmstadt.ukp.experiments.argumentation.clustering.debatefiltering.LuceneSearcher.java

License: Apache License

public List<String> retrieveTopNDocs(String textQuery, int topN) throws Exception {
    // Now search the index:
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);

    Directory directory = FSDirectory.open(luceneIndexDir);
    IndexReader reader = DirectoryReader.open(directory);

    IndexSearcher indexSearcher = new IndexSearcher(reader);

    // Parse a simple query
    QueryParser parser = new QueryParser(Version.LUCENE_44, LuceneIndexer.FIELD_TEXT_CONTENT, analyzer);
    Query query = parser.parse(textQuery);

    ScoreDoc[] hits = indexSearcher.search(query, null, topN).scoreDocs;

    List<String> result = new ArrayList<>();

    // Iterate through the results:
    for (int i = 0; i < hits.length; i++) {
        Document hitDoc = indexSearcher.doc(hits[i].doc);
        result.add(hitDoc.getField(LuceneIndexer.FIELD_FILE).stringValue());
        // System.out.println(hitDoc.toString());
        // assertEquals("This is the text to be indexed.", hitDoc.get("fieldname"));
    }
    reader.close();
    directory.close();

    return result;
}
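
A hypothetical call site for the method above; how LuceneSearcher is constructed and where its luceneIndexDir comes from are not part of the excerpt, so the wiring below is assumed.

// Hypothetical usage; the constructor shown here is an assumption.
LuceneSearcher searcher = new LuceneSearcher(new File("/path/to/debate-index"));
List<String> topDocs = searcher.retrieveTopNDocs("minimum wage increase", 10);
for (String fileName : topDocs) {
    System.out.println(fileName);
}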

From source file:de.twitterlivesearch.analysis.Searcher.java

License: Apache License

/**
 * This is the same as
 * {@link de.twitterlivesearch.analysis.Searcher#searchForTweets(String)
 * searchForTweets(String)}, but the search is limited to the tweet with the
 * given id. This can for example be used to analyze the latest incoming
 * tweet.
 *
 * @param id the id of the tweet to restrict the search to, or null to search all tweets
 * @param queryString the query string to parse and search for
 * @return the matching documents; an empty list if the query is empty or fails, or null if the index does not exist or cannot be opened
 */
public List<Document> searchForTweets(Integer id, String queryString) {
    if (queryString.isEmpty()) {
        return Collections.emptyList();
    }

    AbstractConfiguration config = ConfigurationHolder.getConfiguration();
    try {
        if (!DirectoryReader.indexExists(directory)) {
            return null;
        }
    } catch (IOException e) {
        log.fatal("Error when trying to check if directory exists!", e);
        return new ArrayList<>();
    }
    DirectoryReader ireader;
    try {
        ireader = DirectoryReader.open(directory);
    } catch (IOException e) {
        log.fatal("Error when trying to open directory!", e);
        return null;
    }

    IndexSearcher isearcher = new IndexSearcher(ireader);
    Query textQuery = null;
    QueryParser parser = new QueryParser(FieldNames.TEXT.getField(),
            AnalyzerMapping.getInstance().ANALYZER_FOR_DELIMITER);
    parser.setDefaultOperator(config.getDefaultOperator());
    BooleanQuery query = new BooleanQuery();
    try {
        textQuery = parser.parse(queryString);
    } catch (ParseException e) {
        log.fatal("Error while parsing query: " + queryString, e);
        // Bail out: query.add(textQuery, ...) below would otherwise throw a NullPointerException.
        return new ArrayList<>();
    }

    // if id does not equal null only the query with the given id will be
    // searched
    // this can be used to search the latest element only
    if (id != null) {
        Query idQuery = NumericRangeQuery.newIntRange(FieldNames.ID.getField(), id.intValue(), id.intValue(),
                true, true);
        query.add(idQuery, Occur.MUST);
    }
    query.add(textQuery, Occur.MUST);
    ScoreDoc[] hits = null;
    try {
        hits = isearcher.search(query, 1000).scoreDocs;
    } catch (IOException e) {
        log.fatal("Error while trying to search!", e);
        // hits stays null on failure; return early instead of dereferencing it below.
        return new ArrayList<>();
    }
    List<Document> result = new ArrayList<>();
    for (int i = 0; i < hits.length; i++) {
        try {
            result.add(isearcher.doc(hits[i].doc));
            log.info("Found result for query \"" + queryString + "\".");
        } catch (IOException e) {
            log.fatal("Error when getting document!", e);
        }
    }
    return result;
}
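
The id restriction above boils down to AND-ing the parsed text query with an exact NumericRangeQuery on the id field. A minimal standalone sketch of that combination follows, using the same Lucene 4.x classes; the field names here are assumptions.

import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.NumericRangeQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

public class IdRestrictedQuerySketch {
    public static Query restrictToId(Query textQuery, int id) {
        BooleanQuery query = new BooleanQuery(); // Lucene 4.x style, as above
        // Exact match on a numeric field: the range [id, id], both ends inclusive.
        Query idQuery = NumericRangeQuery.newIntRange("id", id, id, true, true);
        query.add(idQuery, Occur.MUST);
        query.add(textQuery, Occur.MUST);
        return query;
    }

    public static void main(String[] args) {
        Query text = new TermQuery(new Term("text", "lucene"));
        System.out.println(restrictToId(text, 42));
    }
}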

From source file:de.unihildesheim.iw.cli.DumpIPCs.java

License: Open Source License

private void runMain(final String... args) throws IOException, BuildException {
    new CmdLineParser(this.cliParams);
    parseWithHelp(this.cliParams, args);

    // check, if files and directories are sane
    this.cliParams.check();

    assert this.cliParams.idxReader != null;
    final int maxDoc = this.cliParams.idxReader.maxDoc();
    if (maxDoc == 0) {
        LOG.error("Empty index.");
        return;
    }

    final Parser ipcParser = new Parser();
    ipcParser.separatorChar(this.cliParams.sep);
    ipcParser.allowZeroPad(this.cliParams.zeroPad);

    final DirectoryReader reader = DirectoryReader.open(FSDirectory.open(this.cliParams.idxDir.toPath()));
    final Builder idxReaderBuilder = new Builder(reader);

    Pattern rx_ipc = null;

    if (this.cliParams.ipc != null) {
        final IPCRecord ipc = ipcParser.parse(this.cliParams.ipc);
        final BooleanQuery bq = new BooleanQuery();
        rx_ipc = Pattern.compile(ipc.toRegExpString(this.cliParams.sep));
        if (LOG.isDebugEnabled()) {
            LOG.debug("IPC regExp: rx={} pat={}", ipc.toRegExpString(this.cliParams.sep), rx_ipc);
        }

        bq.add(new QueryWrapperFilter(IPCClassQuery.get(ipc, this.cliParams.sep)), Occur.MUST);
        bq.add(new QueryWrapperFilter(
                new IPCFieldFilter(new IPCFieldFilterFunctions.SloppyMatch(ipc), ipcParser)), Occur.MUST);
        idxReaderBuilder.queryFilter(new QueryWrapperFilter(bq));
    }

    final IndexReader idxReader = idxReaderBuilder.build();

    if (idxReader.numDocs() > 0) {
        final Terms terms = MultiFields.getTerms(idxReader, LUCENE_CONF.FLD_IPC);
        TermsEnum termsEnum = TermsEnum.EMPTY;
        BytesRef term;
        if (terms != null) {
            termsEnum = terms.iterator(termsEnum);
            term = termsEnum.next();

            final int[] count = { 0, 0 }; // match, exclude
            while (term != null) {
                final String code = term.utf8ToString();
                if (rx_ipc == null || (rx_ipc.matcher(code).matches())) {
                    final IPCRecord record = ipcParser.parse(code);
                    try {
                        System.out.println(code + ' ' + record + " (" + record.toFormattedString() + ") " + '['
                                + record.toRegExpString('-') + ']');
                    } catch (final IllegalArgumentException e) {
                        System.out.println(code + ' ' + "INVALID (" + code + ')');
                    }
                    count[0]++;
                } else {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Skip non matching IPC: {}", code);
                    }
                    count[1]++;
                }
                term = termsEnum.next();
            }
            LOG.info("match={} skip={}", count[0], count[1]);
        }
    } else {
        LOG.info("No documents left after filtering.");
    }
}
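
One detail worth noting in runMain: terms.iterator(termsEnum) passes the previous enum back in, a Lucene 4.x idiom that lets the codec reuse the instance instead of allocating a fresh one for each field. A minimal sketch of that reuse idiom follows; the field names and index path are assumptions.

import java.io.File;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class ReuseTermsEnumSketch {
    public static void main(String[] args) throws Exception {
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("/path/to/index")));
        try {
            TermsEnum termsEnum = TermsEnum.EMPTY;
            for (String field : new String[] { "title", "body" }) { // assumed field names
                Terms terms = MultiFields.getTerms(reader, field);
                if (terms == null) {
                    continue;
                }
                // Passing the previous enum lets the codec recycle it.
                termsEnum = terms.iterator(termsEnum);
                BytesRef term;
                while ((term = termsEnum.next()) != null) {
                    System.out.println(field + ": " + term.utf8ToString());
                }
            }
        } finally {
            reader.close();
        }
    }
}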

From source file:de.unihildesheim.iw.lucene.index.FDRIndexDataProviderTest.java

License: Open Source License

@Test
public void testBuilder_tvIndex() throws Exception {
    try (TestMemIndex idx = new TestMemIndex(Index.TVECTORS)) {
        final DirectoryReader reader = DirectoryReader.open(idx.dir);
        final FilteredDirectoryReader idxReader = new FilteredDirectoryReader.Builder(reader).build();
        new FDRIndexDataProvider.Builder().indexReader(idxReader).build();
    }
}

From source file:de.unihildesheim.iw.lucene.index.FDRIndexDataProviderTest.java

License: Open Source License

@Test
public void testBuilder_noTvIndex() throws Exception {
    try (TestMemIndex idx = new TestMemIndex(Index.NO_TVECTORS)) {
        final DirectoryReader reader = DirectoryReader.open(idx.dir);
        final FilteredDirectoryReader idxReader = new FilteredDirectoryReader.Builder(reader).build();
        try {
            new FDRIndexDataProvider.Builder().indexReader(idxReader).build();
            Assert.fail("Expected an IllegalStateException to be thrown.");
        } catch (final IllegalStateException e) {
            // pass
        }
    }
}