Example usage for org.apache.lucene.util BytesRef utf8ToString

Introduction

On this page you can find example usages of the org.apache.lucene.util.BytesRef method utf8ToString().

Prototype

public String utf8ToString() 

Document

Interprets the stored bytes as UTF-8 and returns the resulting string.
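
As a quick orientation before the real-world snippets below, here is a minimal, self-contained sketch of the round trip between String and BytesRef (the class name BytesRefUtf8Demo and its main method are illustrative only; BytesRef and utf8ToString() are the actual Lucene API):

import org.apache.lucene.util.BytesRef;

public class BytesRefUtf8Demo {
    public static void main(String[] args) {
        // BytesRef(CharSequence) stores the UTF-8 encoding of the given text.
        BytesRef ref = new BytesRef("häuser");

        // utf8ToString() decodes the stored bytes back into a java.lang.String.
        String decoded = ref.utf8ToString();
        System.out.println(decoded); // prints: häuser
    }
}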

Usage

From source file:de.tudarmstadt.ukp.dkpro.tc.features.pair.core.ngram.LuceneNGramCPFE.java

License:Apache License
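
This feature extractor iterates all terms of an n-gram combination field, keeps only those combinations whose two parts occur in previously collected top-k sets and whose combined size lies within the configured bounds, and gathers the most frequent survivors in a bounded priority queue.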

private FrequencyDistribution<String> getTopNgramsCombo(int topNgramThreshold, String fieldName)
        throws ResourceInitializationException {

    FrequencyDistribution<String> topNGrams = new FrequencyDistribution<String>();

    MinMaxPriorityQueue<TermFreqTuple> topN = MinMaxPriorityQueue.maximumSize(topNgramThreshold).create();
    IndexReader reader;
    try {
        reader = DirectoryReader.open(FSDirectory.open(luceneDir));
        Fields fields = MultiFields.getFields(reader);
        if (fields != null) {
            Terms terms = fields.terms(fieldName);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    String term = text.utf8ToString();
                    long freq = termsEnum.totalTermFreq();
                    //add conditions here, like ngram1 is in most freq ngrams1...
                    // split the combined term once into its two n-gram parts
                    String[] parts = term.split(ComboUtils.JOINT);
                    String combo1 = parts[0];
                    String combo2 = parts[1];
                    int combinedSize = combo1.split("_").length + combo2.split("_").length;
                    if (topKSetView1.contains(combo1) && topKSet.contains(combo1)
                            && topKSetView2.contains(combo2) && topKSet.contains(combo2)
                            && combinedSize <= ngramMaxNCombo && combinedSize >= ngramMinNCombo) {
                        //print out here for testing
                        topN.add(new TermFreqTuple(term, freq));
                    }
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    int size = topN.size();
    for (int i = 0; i < size; i++) {
        TermFreqTuple tuple = topN.poll();
        // System.out.println(tuple.getTerm() + " - " + tuple.getFreq());
        topNGrams.addSample(tuple.getTerm(), tuple.getFreq());
    }

    return topNGrams;
}

From source file:de.tudarmstadt.ukp.dkpro.tc.features.pair.core.ngram.LuceneNGramPFE.java

License:Apache License
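
This simpler variant collects the top n-grams of a field without any combination filtering: every term competes for a slot in the bounded priority queue and is finally transferred into a frequency distribution.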

private FrequencyDistribution<String> getTopNgrams(int topNgramThreshold, String fieldName)
        throws ResourceInitializationException {

    FrequencyDistribution<String> topNGrams = new FrequencyDistribution<String>();

    MinMaxPriorityQueue<TermFreqTuple> topN = MinMaxPriorityQueue.maximumSize(topNgramThreshold).create();
    IndexReader reader;
    try {
        reader = DirectoryReader.open(FSDirectory.open(luceneDir));
        Fields fields = MultiFields.getFields(reader);
        if (fields != null) {
            Terms terms = fields.terms(fieldName);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    String term = text.utf8ToString();
                    long freq = termsEnum.totalTermFreq();
                    topN.add(new TermFreqTuple(term, freq));
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    int size = topN.size();
    for (int i = 0; i < size; i++) {
        TermFreqTuple tuple = topN.poll();
        // System.out.println(tuple.getTerm() + " - " + tuple.getFreq());
        topNGrams.addSample(tuple.getTerm(), tuple.getFreq());
    }

    return topNGrams;
}

From source file:de.tudarmstadt.ukp.dkpro.tc.features.pair.core.ngram.meta.LuceneNGramCPMetaCollectorTest.java

License:Apache License
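
This test runs the combined-n-gram meta collector over a small text-pair corpus and then iterates the resulting Lucene index: each term is decoded with utf8ToString() so the document and collection frequencies of a selected combined n-gram can be asserted.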

@Test
public void combinedNgramPairMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TestPairReader.class,
            TestPairReader.PARAM_INPUT_FILE, "src/test/resources/data/textpairs.txt");

    AnalysisEngineDescription segmenter = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);

    AggregateBuilder builder = new AggregateBuilder();
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_TWO);

    AnalysisEngineDescription metaCollector = AnalysisEngineFactory.createEngineDescription(
            LuceneNGramCPMetaCollector.class, LuceneNGramCPFE.PARAM_LUCENE_DIR, tmpDir);

    // test fails if for-loop removed
    for (@SuppressWarnings("unused")
    JCas jcas : new JCasIterable(reader, builder.createAggregateDescription(), metaCollector)) {
        // System.out.println(jcas.getDocumentText().length());
    }

    int i = 0;
    IndexReader index;
    try {
        index = DirectoryReader.open(FSDirectory.open(tmpDir));
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(LuceneNGramCPFE.LUCENE_NGRAM_FIELDCOMBO);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);

                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    // System.out.println(text.utf8ToString() + " - " +
                    // termsEnum.totalTermFreq());
                    // System.out.println(termsEnum.docFreq());

                    // if there were multiple instances of the same ngram,
                    // then this would be relevant
                    if (text.utf8ToString().equals("mice_ANDcats_.")) {
                        assertEquals(1, termsEnum.docFreq());
                        assertEquals(1, termsEnum.totalTermFreq());
                    }
                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    assertEquals(65, i);
}

From source file:de.tudarmstadt.ukp.dkpro.tc.features.pair.core.ngram.meta.LuceneNGramPMetaCollectorTest.java

License:Apache License
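
The same setup for the plain pair-n-gram meta collector: the test decodes each indexed term with utf8ToString() and asserts the frequencies of the term "this" as well as the total term count.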

@Test
public void lucenePairNgramMetaCollectorTest() throws Exception {
    File tmpDir = folder.newFolder();

    CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(TestPairReader.class,
            TestPairReader.PARAM_INPUT_FILE, "src/test/resources/data/textpairs.txt");

    AnalysisEngineDescription segmenter = AnalysisEngineFactory
            .createEngineDescription(BreakIteratorSegmenter.class);

    AggregateBuilder builder = new AggregateBuilder();
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_ONE);
    builder.add(segmenter, Constants.INITIAL_VIEW, Constants.PART_TWO);

    AnalysisEngineDescription metaCollector = AnalysisEngineFactory
            .createEngineDescription(LuceneNGramPMetaCollector.class, LuceneNGramPFE.PARAM_LUCENE_DIR, tmpDir);

    // test fails if for-loop removed
    for (@SuppressWarnings("unused")
    JCas jcas : new JCasIterable(reader, builder.createAggregateDescription(), metaCollector)) {
        // System.out.println(jcas.getDocumentText().length());
    }

    int i = 0;
    IndexReader index;
    try {
        index = DirectoryReader.open(FSDirectory.open(tmpDir));
        Fields fields = MultiFields.getFields(index);
        if (fields != null) {
            Terms terms = fields.terms(LuceneNGramDFE.LUCENE_NGRAM_FIELD);
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);

                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    // System.out.println(text.utf8ToString() + " - " +
                    // termsEnum.totalTermFreq());
                    // System.out.println(termsEnum.docFreq());

                    if (text.utf8ToString().equals("this")) {
                        assertEquals(2, termsEnum.docFreq());
                        assertEquals(3, termsEnum.totalTermFreq());
                    }

                    i++;
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    assertEquals(16, i);
}

From source file:de.unihildesheim.iw.cli.DumpCommonTerms.java

License:Open Source License
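
This command-line tool iterates all terms of an index field and writes every non-stopword term whose relative document frequency exceeds a configured threshold to a CSV file.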

/**
 * Class setup.
 *
 * @param args Commandline arguments.
 * @throws IOException Thrown on low-level i/o-errors
 */
@SuppressWarnings("ObjectAllocationInLoop")
@SuppressFBWarnings("SEC_SIDE_EFFECT_CONSTRUCTOR")
private void runMain(final String... args) throws IOException {
    new CmdLineParser(this.cliParams);
    parseWithHelp(this.cliParams, args);

    // check, if files and directories are sane
    this.cliParams.check();

    LOG.info("Writing terms to '{}'.", this.cliParams.targetFile);

    assert this.cliParams.lang != null;
    assert this.cliParams.stopFilePattern != null;
    final Set<String> sWords = CliCommon.getStopwords(this.cliParams.lang, this.cliParams.stopFileFormat,
            this.cliParams.stopFilePattern);

    assert this.cliParams.idxReader != null;
    final int maxDoc = this.cliParams.idxReader.maxDoc();
    if (maxDoc == 0) {
        LOG.error("Empty index.");
        return;
    }

    final Terms terms = MultiFields.getTerms(this.cliParams.idxReader, this.cliParams.field);
    TermsEnum termsEnum = TermsEnum.EMPTY;
    BytesRef term;

    if (terms != null) {
        termsEnum = terms.iterator(termsEnum);
        term = termsEnum.next();

        assert this.cliParams.targetFile != null;
        try (CSVWriter csvWriter = new CSVWriter(new OutputStreamWriter(
                new FileOutputStream(this.cliParams.targetFile), StandardCharsets.UTF_8))) {

            // write header line
            csvWriter.writeNext(new String[] { "term", "relDF" });

            while (term != null) {
                final String termStr = term.utf8ToString();
                if (!sWords.contains(termStr.toLowerCase())) {
                    final double docFreq = (double) termsEnum.docFreq();
                    if (docFreq > 0d) {
                        final double relDocFreq = docFreq / (double) maxDoc;

                        if (relDocFreq > this.cliParams.threshold) {
                            // log term
                            csvWriter.writeNext(new String[] { termStr,
                                    // make exponential string R compatible
                                    Double.toString(relDocFreq).toLowerCase() });
                        }
                    }
                }
                term = termsEnum.next();
            }
        }
    }
}

From source file:de.unihildesheim.iw.cli.DumpIPCs.java

License:Open Source License
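
This tool dumps the IPC codes stored in an index field: each term is decoded with utf8ToString(), optionally matched against a parsed IPC pattern, and printed in several formatted representations.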

private void runMain(final String... args) throws IOException, BuildException {
    new CmdLineParser(this.cliParams);
    parseWithHelp(this.cliParams, args);

    // check, if files and directories are sane
    this.cliParams.check();

    assert this.cliParams.idxReader != null;
    final int maxDoc = this.cliParams.idxReader.maxDoc();
    if (maxDoc == 0) {
        LOG.error("Empty index.");
        return;
    }

    final Parser ipcParser = new Parser();
    ipcParser.separatorChar(this.cliParams.sep);
    ipcParser.allowZeroPad(this.cliParams.zeroPad);

    final DirectoryReader reader = DirectoryReader.open(FSDirectory.open(this.cliParams.idxDir.toPath()));
    final Builder idxReaderBuilder = new Builder(reader);

    Pattern rx_ipc = null;

    if (this.cliParams.ipc != null) {
        final IPCRecord ipc = ipcParser.parse(this.cliParams.ipc);
        final BooleanQuery bq = new BooleanQuery();
        rx_ipc = Pattern.compile(ipc.toRegExpString(this.cliParams.sep));
        if (LOG.isDebugEnabled()) {
            LOG.debug("IPC regExp: rx={} pat={}", ipc.toRegExpString(this.cliParams.sep), rx_ipc);
        }

        bq.add(new QueryWrapperFilter(IPCClassQuery.get(ipc, this.cliParams.sep)), Occur.MUST);
        bq.add(new QueryWrapperFilter(
                new IPCFieldFilter(new IPCFieldFilterFunctions.SloppyMatch(ipc), ipcParser)), Occur.MUST);
        idxReaderBuilder.queryFilter(new QueryWrapperFilter(bq));
    }

    final IndexReader idxReader = idxReaderBuilder.build();

    if (idxReader.numDocs() > 0) {
        final Terms terms = MultiFields.getTerms(idxReader, LUCENE_CONF.FLD_IPC);
        TermsEnum termsEnum = TermsEnum.EMPTY;
        BytesRef term;
        if (terms != null) {
            termsEnum = terms.iterator(termsEnum);
            term = termsEnum.next();

            final int[] count = { 0, 0 }; // match, exclude
            while (term != null) {
                final String code = term.utf8ToString();
                if (rx_ipc == null || (rx_ipc.matcher(code).matches())) {
                    final IPCRecord record = ipcParser.parse(code);
                    try {
                        System.out.println(code + ' ' + record + " (" + record.toFormattedString() + ") " + '['
                                + record.toRegExpString('-') + ']');
                    } catch (final IllegalArgumentException e) {
                        System.out.println(code + ' ' + "INVALID (" + code + ')');
                    }
                    count[0]++;
                } else {
                    if (LOG.isDebugEnabled()) {
                        LOG.debug("Skip non matching IPC: {}", code);
                    }
                    count[1]++;
                }
                term = termsEnum.next();
            }
            LOG.info("match={} skip={}", count[0], count[1]);
        }
    } else {
        LOG.info("No documents left after filtering.");
    }
}

From source file:de.unihildesheim.iw.cli.DumpTermData.java

License:Open Source License
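
This tool collects term statistics: for every non-stopword term above a document-frequency threshold it stores the term together with its absolute and relative document frequency, language, field name and an optional IPC code in a database table.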

/**
 * Class setup.
 *
 * @param args Commandline arguments.
 * @throws IOException Thrown on low-level i/o-errors
 * @throws ClassNotFoundException Thrown if JDBC driver could not be loaded
 * @throws SQLException Thrown, if connection to the database has failed
 * @throws BuildException Thrown, if building a {@link
 * FilteredDirectoryReader} instance has failed
 */
@SuppressWarnings("UnnecessarilyQualifiedInnerClassAccess")
private void runMain(final String... args)
        throws IOException, SQLException, ClassNotFoundException, BuildException {
    new CmdLineParser(this.cliParams);
    parseWithHelp(this.cliParams, args);

    // check, if files and directories are sane
    this.cliParams.check();

    LOG.info("Writing term-data to '{}'.", this.cliParams.dbFile);

    // table manager instance: Target database for term data
    try (final TermDataDB db = new TermDataDB(this.cliParams.dbFile)) {
        // create meta & data table
        final Table termsTable;
        if (this.cliParams.ipcRec == null) {
            termsTable = new TermsTable();
        } else {
            termsTable = new TermsTable(
                    // include optional IPC field
                    TermsTable.FieldsOptional.IPC);
        }
        final Table metaTable = new MetaTable();
        db.createTables(termsTable, metaTable);

        try (final TermsTable.Writer dataWriter = new TermsTable.Writer(db.getConnection())) {

            // write meta-data
            try (final MetaTable.Writer metaWriter = new MetaTable.Writer(db.getConnection())) {
                metaWriter.addContent(new TableFieldContent(metaTable)
                        .setValue(MetaTable.Fields.TABLE_NAME, termsTable.getName())
                        .setValue(MetaTable.Fields.CMD, StringUtils.join(args, " ")));
            }

            final Set<String> sWords;
            if (this.cliParams.stopFilePattern != null) {
                sWords = CliCommon.getStopwords(this.cliParams.lang, this.cliParams.stopFileFormat,
                        this.cliParams.stopFilePattern);
            } else {
                sWords = Collections.emptySet();
            }

            final int maxDoc = this.cliParams.idxReader.maxDoc();
            if (maxDoc == 0) {
                LOG.error("Empty index.");
                return;
            }

            final Terms terms = MultiFields.getTerms(this.cliParams.idxReader, this.cliParams.field);
            TermsEnum termsEnum = TermsEnum.EMPTY;
            BytesRef term;

            if (terms != null) {
                termsEnum = terms.iterator(termsEnum);
                term = termsEnum.next();
                final AtomicLong count = new AtomicLong(0L);

                @SuppressWarnings("AnonymousInnerClassMayBeStatic")
                final TaskObserver obs = new TaskObserver(new TaskObserverMessage() {
                    @Override
                    public void call(@NotNull final TimeMeasure tm) {
                        LOG.info("Collected {} terms after {}.",
                                NumberFormat.getIntegerInstance().format(count.get()), tm.getTimeString());
                    }
                }).start();

                // normalize some parameters
                final String fieldName = StringUtils.lowerCase(this.cliParams.field);
                final String langName = StringUtils.lowerCase(this.cliParams.lang);

                while (term != null) {
                    final String termStr = term.utf8ToString();
                    if (!sWords.contains(termStr.toLowerCase())) {
                        final double docFreq = (double) termsEnum.docFreq();
                        if (docFreq > 0d) {
                            final double relDocFreq = docFreq / (double) maxDoc;

                            if (relDocFreq > this.cliParams.threshold) {
                                @SuppressWarnings("ObjectAllocationInLoop")
                                final TableFieldContent tfc = new TableFieldContent(termsTable);
                                tfc.setValue(TermsTable.Fields.TERM, termStr);
                                tfc.setValue(TermsTable.Fields.DOCFREQ_REL, relDocFreq);
                                tfc.setValue(TermsTable.Fields.DOCFREQ_ABS, docFreq);
                                tfc.setValue(TermsTable.Fields.LANG, langName);
                                tfc.setValue(TermsTable.Fields.FIELD, fieldName);
                                if (this.cliParams.ipcRec != null) {
                                    tfc.setValue(TermsTable.FieldsOptional.IPC,
                                            this.cliParams.ipcRec.toFormattedString());
                                }
                                dataWriter.addContent(tfc, false);
                                count.incrementAndGet();
                            }
                        }
                    }
                    term = termsEnum.next();
                }
                obs.stop();
                LOG.info("Total of {} terms collected.", NumberFormat.getIntegerInstance().format(count));
            }
        }
    }
}

From source file:de.unihildesheim.iw.lucene.analyzer.EnglishAnalyzerTest.java

License:Open Source License
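
Without stopwords, the EnglishAnalyzer keeps all four query tokens; the test decodes each collected BytesRef with utf8ToString() to verify the terms.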

@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream_noStopwords() throws Exception {
    final String query = "foo bar baz bam";
    final Analyzer analyzer = new EnglishAnalyzer();
    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));

    try (TokenStream stream = analyzer.tokenStream(null, query)) {
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(stream.getAttribute(CharTermAttribute.class));
            if (term.length > 0) {
                result.append(term);
            }
        }
    }

    Assert.assertEquals("Not all terms returned.", 4L, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        Assert.assertTrue("Unknown term found.",
                "foo".equals(term.utf8ToString()) || "bar".equals(term.utf8ToString())
                        || "baz".equals(term.utf8ToString()) || "bam".equals(term.utf8ToString()));
    }
}

From source file:de.unihildesheim.iw.lucene.analyzer.EnglishAnalyzerTest.java

License:Open Source License
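
With "foo" and "bar" configured as stopwords, only the two remaining tokens pass the token stream.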

@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream() throws Exception {
    final CharArraySet csa = new CharArraySet(Arrays.asList("foo", "bar"), true);
    final String query = "foo bar baz bam";
    final Analyzer analyzer = new EnglishAnalyzer(csa);
    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));

    try (TokenStream stream = analyzer.tokenStream(null, query)) {
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(stream.getAttribute(CharTermAttribute.class));
            if (term.length > 0) {
                result.append(term);
            }
        }
    }

    Assert.assertEquals("Not all terms returned.", 2L, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        Assert.assertTrue("Unknown term found.",
                "baz".equals(term.utf8ToString()) || "bam".equals(term.utf8ToString()));
    }
}

From source file:de.unihildesheim.iw.lucene.analyzer.FrenchAnalyzerTest.java

License:Open Source License
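
The FrenchAnalyzer counterpart of the no-stopword test above: all four tokens are expected to survive and are verified via utf8ToString().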

@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream_noStopwords() throws Exception {
    final String query = "foo bar baz bam";
    final Analyzer analyzer = new FrenchAnalyzer();
    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));

    try (TokenStream stream = analyzer.tokenStream(null, query)) {
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(stream.getAttribute(CharTermAttribute.class));
            if (term.length > 0) {
                result.append(term);
            }
        }
    }

    Assert.assertEquals("Not all terms returned.", 4L, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        Assert.assertTrue("Unknown term found.",
                "foo".equals(term.utf8ToString()) || "bar".equals(term.utf8ToString())
                        || "baz".equals(term.utf8ToString()) || "bam".equals(term.utf8ToString()));
    }
}