Example usage for org.apache.lucene.util BytesRef utf8ToString

List of usage examples for org.apache.lucene.util BytesRef utf8ToString

Introduction

On this page you can find example usage for org.apache.lucene.util BytesRef utf8ToString.

Prototype

public String utf8ToString() 

Source Link

Document

Interprets the stored bytes as UTF-8, returning the resulting String.
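Before the full examples below, here is a minimal, self-contained sketch of the String/BytesRef round trip (the class name BytesRefUtf8Example is only illustrative):

import org.apache.lucene.util.BytesRef;

public class BytesRefUtf8Example {
    public static void main(String[] args) {
        // The BytesRef(CharSequence) constructor encodes the text as UTF-8 bytes.
        final BytesRef ref = new BytesRef("caf\u00e9");
        // length counts bytes, not characters: the accented 'e' takes two UTF-8 bytes.
        System.out.println(ref.length);         // 5
        // utf8ToString() decodes the stored bytes back into the original String.
        System.out.println(ref.utf8ToString()); // café
    }
}

Note that utf8ToString() assumes the referenced bytes are valid UTF-8; a BytesRef wrapping arbitrary binary data should not be converted this way.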

Usage

From source file: de.unihildesheim.iw.lucene.analyzer.FrenchAnalyzerTest.java

License: Open Source License

@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream() throws Exception {
    final CharArraySet csa = new CharArraySet(Arrays.asList("foo", "bar"), true);
    final String query = "foo bar baz bam";
    final Analyzer analyzer = new FrenchAnalyzer(csa);
    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));

    try (TokenStream stream = analyzer.tokenStream(null, query)) {
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(stream.getAttribute(CharTermAttribute.class));
            if (term.length > 0) {
                result.append(term);
            }
        }
    }

    Assert.assertEquals("Not all terms returned.", 2L, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        Assert.assertTrue("Unknown term found.",
                "baz".equals(term.utf8ToString()) || "bam".equals(term.utf8ToString()));
    }
}

From source file: de.unihildesheim.iw.lucene.analyzer.FrenchAnalyzerTest.java

License: Open Source License

@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream_elisions() throws Exception {
    final CharArraySet csa = new CharArraySet(Arrays.asList("foo", "bar"), true);
    final StringBuilder query = new StringBuilder("foo bar baz bam ");
    // add all elisions to the query
    for (final String s : FrenchAnalyzer.DEFAULT_ELISIONS) {
        query.append(s).append("'bim ");
    }
    final Analyzer analyzer = new FrenchAnalyzer(csa);
    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));

    try (TokenStream stream = analyzer.tokenStream(null, query.toString())) {
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(stream.getAttribute(CharTermAttribute.class));
            if (term.length > 0) {
                result.append(term);
            }
        }
    }

    Assert.assertEquals("Not all terms returned.", 2L + FrenchAnalyzer.DEFAULT_ELISIONS.length, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        Assert.assertTrue("Unknown term found.",
                "baz".equals(term.utf8ToString()) || "bam".equals(term.utf8ToString()) ||
                // elisions should be removed from this
                        "bim".equals(term.utf8ToString()));
    }
}

From source file: de.unihildesheim.iw.lucene.analyzer.GermanAnalyzerTest.java

License: Open Source License

@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream_noStopwords() throws Exception {
    final String query = "foo bar baz bam";
    final Analyzer analyzer = new GermanAnalyzer();
    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));

    try (TokenStream stream = analyzer.tokenStream(null, query)) {
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(stream.getAttribute(CharTermAttribute.class));
            if (term.length > 0) {
                result.append(term);
            }
        }
    }

    Assert.assertEquals("Not all terms returned.", 4L, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        Assert.assertTrue("Unknown term found.",
                "foo".equals(term.utf8ToString()) || "bar".equals(term.utf8ToString())
                        || "baz".equals(term.utf8ToString()) || "bam".equals(term.utf8ToString()));
    }
}

From source file: de.unihildesheim.iw.lucene.analyzer.GermanAnalyzerTest.java

License: Open Source License

@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream() throws Exception {
    final CharArraySet csa = new CharArraySet(Arrays.asList("foo", "bar"), true);
    final String query = "foo bar baz bam";
    final Analyzer analyzer = new GermanAnalyzer(csa);
    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));

    try (TokenStream stream = analyzer.tokenStream(null, query)) {
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(stream.getAttribute(CharTermAttribute.class));
            if (term.length > 0) {
                result.append(term);
            }
        }
    }

    Assert.assertEquals("Not all terms returned.", 2L, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        Assert.assertTrue("Unknown term found.",
                "baz".equals(term.utf8ToString()) || "bam".equals(term.utf8ToString()));
    }
}

From source file: de.unihildesheim.iw.lucene.index.FilteredDirectoryReaderTest.java

License: Open Source License

/**
 * Test basic {@link TermFilter} usage.
 *
 * @throws Exception
 */
@SuppressWarnings({ "AnonymousInnerClassMayBeStatic", "ImplicitNumericConversion" })
@Test
public void testBuilder_termFilter() throws Exception {
    try (TestMemIndex idx = new TestMemIndex(Index.PLAIN)) {
        final String skipTerm = "first";
        final DirectoryReader reader = DirectoryReader.open(idx.dir);
        final FilteredDirectoryReader fReader = new Builder(reader).termFilter(new TermFilter() {
            @Override
            public boolean isAccepted(@Nullable final TermsEnum termsEnum, @NotNull final BytesRef term) {
                return !skipTerm.equals(term.utf8ToString());
            }
        }).build();

        new LeafReaderInstanceTest() {

            @Override
            void testHasDeletions() throws Exception {
                Assert.assertFalse("Reader has deletions.", fReader.hasDeletions());
            }

            @Override
            void testFieldCount() throws Exception {
                Assert.assertEquals("Field count mismatch.", idx.flds.size(), fReader.getFields().size());
            }

            @Override
            void testFieldNames() throws Exception {
                Assert.assertTrue("Visible field not found.", fReader.getFields().containsAll(idx.flds));
            }

            @Override
            void testTotalTermFreq() throws Exception {
                Assert.assertEquals("TotalTermFreq mismatch for visible term.", idx.docs,
                        fReader.totalTermFreq(new Term("f1", "field")));
                Assert.assertEquals("TotalTermFreq mismatch for missing term.", 0L,
                        fReader.totalTermFreq(new Term("f1", "foo")));
                Assert.assertEquals("TotalTermFreq mismatch for hidden term.", 0L,
                        fReader.totalTermFreq(new Term("f1", "first")));
            }

            @Override
            void testSumTotalTermFreq() throws Exception {
                Assert.assertEquals("SumTotalTermFreq mismatch for visible term.", 14L,
                        fReader.getSumTotalTermFreq("f1"));
            }

            @Override
            void testDocCount() throws Exception {
                Assert.assertEquals("Doc count mismatch.", idx.docs, fReader.getDocCount("f1"));
            }

            @SuppressWarnings("ObjectAllocationInLoop")
            @Override
            void testDocFreq() throws Exception {
                for (final String f : idx.flds) {
                    Assert.assertEquals("Missing term from all documents.", idx.docs,
                            fReader.docFreq(new Term(f, "value")));
                    Assert.assertEquals("Found hidden term.", 0L, fReader.docFreq(new Term(f, "first")));
                }
            }

            @Override
            void testSumDocFreq() throws Exception {
                Assert.assertEquals("SumDocFreq mismatch for visible term.", 14L, fReader.getSumDocFreq("f1"));
            }

            @Override
            void testTermVectors() throws Exception {
                final BytesRef term = new BytesRef("first");
                for (int i = 0; i < idx.docs - 1; i++) {
                    final Fields f = fReader.getTermVectors(i);
                    Assert.assertEquals("Too much fields retrieved from TermVector.", 1L, f.size());
                    final TermsEnum te = f.terms("f1").iterator(null);
                    Assert.assertFalse("Hidden term found.", te.seekExact(term));
                }
            }

            @Override
            void testNumDocs() throws Exception {
                Assert.assertEquals("NumDocs mismatch.", idx.docs, fReader.numDocs());
            }

            @Override
            void testMaxDoc() throws Exception {
                Assert.assertEquals("MaxDoc mismatch.", idx.docs, fReader.maxDoc());
            }
        };
    }
}

From source file: de.unihildesheim.iw.lucene.index.FilteredDirectoryReaderTest.java

License: Open Source License

/**
 * Test {@link Filter} usage in combination with {@link TermFilter}
 * restriction.
 *
 * @throws Exception
 */
@SuppressWarnings({ "AnonymousInnerClassMayBeStatic", "ImplicitNumericConversion" })
@Test
public void testBuilder_filter_and_termFilter() throws Exception {
    try (TestMemIndex idx = new TestMemIndex(Index.ALL_FIELDS)) {
        final String skipTerm = "document2field3";
        final Query q = new TermQuery(new Term("f1", "document2field1"));
        final Filter f = new QueryWrapperFilter(q);
        final DirectoryReader reader = DirectoryReader.open(idx.dir);
        final FilteredDirectoryReader fReader = new Builder(reader).queryFilter(f).termFilter(new TermFilter() {
            @Override
            public boolean isAccepted(@Nullable final TermsEnum termsEnum, @NotNull final BytesRef term) {
                return !skipTerm.equals(term.utf8ToString());
            }
        }).build();

        new LeafReaderInstanceTest() {

            @Override
            void testHasDeletions() throws Exception {
                Assert.assertFalse("Reader has deletions.", fReader.hasDeletions());
            }

            @Override
            void testFieldCount() throws Exception {
                Assert.assertEquals("Field count mismatch.", 3L, fReader.getFields().size());
            }

            @Override
            void testFieldNames() throws Exception {
                for (final String fld : idx.flds) {
                    Assert.assertTrue("Visible field not found.", fReader.getFields().contains(fld));
                }
            }

            @Override
            void testTotalTermFreq() throws Exception {
                Assert.assertEquals("TotalTermFreq mismatch for visible term.", 1L,
                        fReader.totalTermFreq(new Term("f1", "field1")));
                Assert.assertEquals("TotalTermFreq mismatch for visible term.", 1L,
                        fReader.totalTermFreq(new Term("f2", "field2")));
                Assert.assertEquals("TotalTermFreq mismatch for visible term.", 1L,
                        fReader.totalTermFreq(new Term("f3", "field3")));
                Assert.assertEquals("TotalTermFreq mismatch for hidden term.", 0L,
                        fReader.totalTermFreq(new Term("f3", "document2field3")));
            }

            @Override
            void testSumTotalTermFreq() throws Exception {
                Assert.assertEquals("SumTotalTermFreq mismatch for visible terms.", 6L,
                        fReader.getSumTotalTermFreq("f2"));
            }

            @Override
            void testDocCount() throws Exception {
                for (final String fld : idx.flds) {
                    Assert.assertEquals("Doc count mismatch.", 1L, fReader.getDocCount(fld));
                }
            }

            @SuppressWarnings("ObjectAllocationInLoop")
            @Override
            void testDocFreq() throws Exception {
                Assert.assertEquals("Missing term from visible document.", 1L,
                        fReader.docFreq(new Term("f2", "value")));
                Assert.assertEquals("Hidden term found.", 0L,
                        fReader.docFreq(new Term("f1", "document1field1")));
                Assert.assertEquals("Hidden term found.", 0L,
                        fReader.docFreq(new Term("f3", "document2field3")));
            }

            @Override
            void testSumDocFreq() throws Exception {
                Assert.assertEquals("SumDocFreq mismatch for visible term.", 6L, fReader.getSumDocFreq("f2"));
                Assert.assertEquals("SumDocFreq mismatch for visible term.", 5L, fReader.getSumDocFreq("f3"));
            }

            @Override
            void testTermVectors() throws Exception {
                boolean match = false;
                final BytesRef term = new BytesRef(skipTerm);
                for (int i = 0; i < fReader.maxDoc(); i++) {
                    final Fields fld = fReader.getTermVectors(i);
                    if (fld != null) {
                        match = true;
                        Assert.assertEquals("Number of fields retrieved from TermVector do not match.", 3L,
                                fld.size());
                        final Terms t = fld.terms("f3");
                        if (t != null) {
                            final TermsEnum te = t.iterator(null);
                            Assert.assertFalse("Hidden term found.", te.seekExact(term));
                        }
                    }
                }
                Assert.assertTrue("Fields not found.", match);
            }

            @Override
            void testNumDocs() throws Exception {
                Assert.assertEquals("NumDocs mismatch.", 1L, fReader.numDocs());
            }

            @Override
            void testMaxDoc() throws Exception {
                Assert.assertEquals("MaxDoc mismatch.", 2L, fReader.maxDoc());
            }
        };
    }
}

From source file: de.unihildesheim.iw.lucene.query.QueryUtils.java

License: Open Source License

/**
 * Removes terms from the given collection of terms if they are not found
 * in the index collection (stopwords or unknown terms).
 *
 * @param dataProv IndexDataProvider used to look up term frequencies
 * @param terms Collection of terms to check against the index collection
 * @return The passed-in terms with all non-collection terms removed
 */
@SuppressFBWarnings("LO_APPENDED_STRING_IN_FORMAT_STRING")
private static BytesRefArray removeUnknownTerms(@NotNull final IndexDataProvider dataProv,
        @NotNull final BytesRefArray terms) {
    final StringBuilder sb = new StringBuilder("Skipped terms (stopword or not in collection): [");
    final FixedBitSet bits = new FixedBitSet(terms.size());
    final BytesRefBuilder spare = new BytesRefBuilder();
    BytesRef term;

    if (terms.size() == 0) {
        return terms;
    } else {
        for (int i = terms.size() - 1; i >= 0; i--) {
            term = terms.get(spare, i);
            if (dataProv.getTermFrequency(term) <= 0L) {
                sb.append(term.utf8ToString()).append(' ');
                bits.set(i);
            }
        }

        if (bits.cardinality() > 0) {
            LOG.warn(sb.toString().trim() + "].");
            final BytesRefArray cleanTerms = new BytesRefArray(Counter.newCounter(false));
            for (int i = terms.size() - 1; i >= 0; i--) {
                if (!bits.get(i)) {
                    term = terms.get(spare, i);
                    cleanTerms.append(term); // copies bytes
                }
            }
            return cleanTerms;
        }
        return terms;
    }
}

From source file: de.unihildesheim.iw.lucene.query.QueryUtilsTest.java

License: Open Source License

@SuppressWarnings("ImplicitNumericConversion")
@Test
public void testTokenizeQuery_noMetrics() throws Exception {
    final BytesRefArray bra = QueryUtils.tokenizeQuery("foo bar baz", ANALYZER, null);

    Assert.assertEquals("Extracted terms count mismatch.", 3L, bra.size());

    final BytesRefIterator braIt = bra.iterator();
    BytesRef term;
    while ((term = braIt.next()) != null) {
        final String termStr = term.utf8ToString();
        switch (termStr) {
        case "foo":
        case "bar":
        case "baz":
            break;
        default:
            Assert.fail("Unknown term found.");
            break;
        }
    }
}

From source file: de.unihildesheim.iw.lucene.query.QueryUtilsTest.java

License: Open Source License

/**
 * Test tokenizing while skipping terms not present in the index.
 *
 * @throws Exception
 */
@SuppressWarnings("ImplicitNumericConversion")
@Test
public void testTokenizeQuery() throws Exception {
    try (TestMemIndex idx = new TestMemIndex()) {
        final IndexDataProvider idp = idx.getIdp();

        final BytesRefArray bra = QueryUtils.tokenizeQuery("foo bar field baz value", ANALYZER, idp);

        Assert.assertEquals("Extracted terms count mismatch.", 2L, bra.size());

        final BytesRefIterator braIt = bra.iterator();
        BytesRef term;
        while ((term = braIt.next()) != null) {
            final String termStr = term.utf8ToString();
            switch (termStr) {
            case "foo":
            case "bar":
            case "baz":
                Assert.fail("Non-index term found.");
                break;
            case "value":
            case "field":
                // pass
                break;
            default:
                Assert.fail("Unknown term found.");
                break;
            }
        }
    }
}

From source file: de.uni_koeln.spinfo.textengineering.tm.classification.lucene.LuceneAdapter.java

License: Open Source License

@Override
public String classify(Document document) {
    try {
        ClassificationResult<BytesRef> result = classifier.assignClass(document.getText());
        BytesRef assignedClass = result.getAssignedClass();
        // printAssignments(document, result); // optional
        return assignedClass.utf8ToString();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return null;
}