List of usage examples for org.apache.lucene.util.BytesRef#utf8ToString()
public String utf8ToString()
From source file:de.unihildesheim.iw.lucene.analyzer.FrenchAnalyzerTest.java
License:Open Source License
/**
 * Verifies that {@link FrenchAnalyzer} drops the configured stop-words
 * ("foo", "bar") and lets the remaining terms ("baz", "bam") through.
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream() throws Exception {
    final CharArraySet stopWords = new CharArraySet(Arrays.asList("foo", "bar"), true);
    final Analyzer analyzer = new FrenchAnalyzer(stopWords);
    final BytesRefArray tokens = new BytesRefArray(Counter.newCounter(false));

    // Collect every non-empty term the analyzer emits for the query.
    try (TokenStream stream = analyzer.tokenStream(null, "foo bar baz bam")) {
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef token = new BytesRef(stream.getAttribute(CharTermAttribute.class));
            if (token.length > 0) {
                tokens.append(token);
            }
        }
    }

    Assert.assertEquals("Not all terms returned.", 2L, tokens.size());

    // Only the non-stop-words may remain.
    final BytesRefIterator it = tokens.iterator();
    BytesRef token;
    while ((token = it.next()) != null) {
        final String s = token.utf8ToString();
        Assert.assertTrue("Unknown term found.", "baz".equals(s) || "bam".equals(s));
    }
}
From source file:de.unihildesheim.iw.lucene.analyzer.FrenchAnalyzerTest.java
License:Open Source License
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" }) @Test// w w w.j a v a 2 s. c om public void testTokenStream_elisions() throws Exception { final CharArraySet csa = new CharArraySet(Arrays.asList("foo", "bar"), true); final StringBuilder query = new StringBuilder("foo bar baz bam "); // add all elisions to the query for (final String s : FrenchAnalyzer.DEFAULT_ELISIONS) { query.append(s).append("\'bim "); } final Analyzer analyzer = new FrenchAnalyzer(csa); final BytesRefArray result = new BytesRefArray(Counter.newCounter(false)); try (TokenStream stream = analyzer.tokenStream(null, query.toString())) { stream.reset(); while (stream.incrementToken()) { final BytesRef term = new BytesRef(stream.getAttribute(CharTermAttribute.class)); if (term.length > 0) { result.append(term); } } } Assert.assertEquals("Not all terms returned.", 2L + FrenchAnalyzer.DEFAULT_ELISIONS.length, result.size()); final BytesRefIterator bri = result.iterator(); BytesRef term; while ((term = bri.next()) != null) { Assert.assertTrue("Unknown term found.", "baz".equals(term.utf8ToString()) || "bam".equals(term.utf8ToString()) || // elisions should be removed from this "bim".equals(term.utf8ToString())); } }
From source file:de.unihildesheim.iw.lucene.analyzer.GermanAnalyzerTest.java
License:Open Source License
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" }) @Test// ww w .j a va 2 s .c om public void testTokenStream_noStopwords() throws Exception { final String query = "foo bar baz bam"; final Analyzer analyzer = new GermanAnalyzer(); final BytesRefArray result = new BytesRefArray(Counter.newCounter(false)); try (TokenStream stream = analyzer.tokenStream(null, query)) { stream.reset(); while (stream.incrementToken()) { final BytesRef term = new BytesRef(stream.getAttribute(CharTermAttribute.class)); if (term.length > 0) { result.append(term); } } } Assert.assertEquals("Not all terms returned.", 4L, result.size()); final BytesRefIterator bri = result.iterator(); BytesRef term; while ((term = bri.next()) != null) { Assert.assertTrue("Unknown term found.", "foo".equals(term.utf8ToString()) || "bar".equals(term.utf8ToString()) || "baz".equals(term.utf8ToString()) || "bam".equals(term.utf8ToString())); } }
From source file:de.unihildesheim.iw.lucene.analyzer.GermanAnalyzerTest.java
License:Open Source License
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" }) @Test// w ww .ja va 2s . c o m public void testTokenStream() throws Exception { final CharArraySet csa = new CharArraySet(Arrays.asList("foo", "bar"), true); final String query = "foo bar baz bam"; final Analyzer analyzer = new GermanAnalyzer(csa); final BytesRefArray result = new BytesRefArray(Counter.newCounter(false)); try (TokenStream stream = analyzer.tokenStream(null, query)) { stream.reset(); while (stream.incrementToken()) { final BytesRef term = new BytesRef(stream.getAttribute(CharTermAttribute.class)); if (term.length > 0) { result.append(term); } } } Assert.assertEquals("Not all terms returned.", 2L, result.size()); final BytesRefIterator bri = result.iterator(); BytesRef term; while ((term = bri.next()) != null) { Assert.assertTrue("Unknown term found.", "baz".equals(term.utf8ToString()) || "bam".equals(term.utf8ToString())); } }
From source file:de.unihildesheim.iw.lucene.index.FilteredDirectoryReaderTest.java
License:Open Source License
/**
 * Test basic {@link TermFilter} usage: the filtered reader must behave as if
 * the term "first" never occurred in the index, while all other statistics
 * stay unchanged.
 *
 * @throws Exception Any exception thrown indicates an error
 */
@SuppressWarnings({ "AnonymousInnerClassMayBeStatic", "ImplicitNumericConversion" })
@Test
public void testBuilder_termFilter() throws Exception {
    try (TestMemIndex idx = new TestMemIndex(Index.PLAIN)) {
        final String skipTerm = "first";
        final DirectoryReader reader = DirectoryReader.open(idx.dir);
        // Wrap the reader with a TermFilter that hides every occurrence of "first".
        final FilteredDirectoryReader fReader = new Builder(reader).termFilter(new TermFilter() {
            @Override
            public boolean isAccepted(@Nullable final TermsEnum termsEnum,
                @NotNull final BytesRef term) {
                return !skipTerm.equals(term.utf8ToString());
            }
        }).build();
        // NOTE(review): LeafReaderInstanceTest presumably invokes the overridden
        // test* methods from its constructor — confirm against its declaration.
        new LeafReaderInstanceTest() {
            @Override
            void testHasDeletions() throws Exception {
                Assert.assertFalse("Reader has deletions.", fReader.hasDeletions());
            }

            @Override
            void testFieldCount() throws Exception {
                // Term filtering must not hide any field.
                Assert.assertEquals("Field count mismatch.", idx.flds.size(),
                    fReader.getFields().size());
            }

            @Override
            void testFieldNames() throws Exception {
                Assert.assertTrue("Visible field not found.",
                    fReader.getFields().containsAll(idx.flds));
            }

            @Override
            void testTotalTermFreq() throws Exception {
                Assert.assertEquals("TotalTermFreq mismatch for visible term.", idx.docs,
                    fReader.totalTermFreq(new Term("f1", "field")));
                // "foo" is not in the index at all.
                Assert.assertEquals("TotalTermFreq mismatch for missing term.", 0L,
                    fReader.totalTermFreq(new Term("f1", "foo")));
                // "first" exists but is hidden by the TermFilter.
                Assert.assertEquals("TotalTermFreq mismatch for hidden term.", 0L,
                    fReader.totalTermFreq(new Term("f1", "first")));
            }

            @Override
            void testSumTotalTermFreq() throws Exception {
                Assert.assertEquals("SumTotalTermFreq mismatch for visible term.", 14L,
                    fReader.getSumTotalTermFreq("f1"));
            }

            @Override
            void testDocCount() throws Exception {
                Assert.assertEquals("Doc count mismatch.", idx.docs, fReader.getDocCount("f1"));
            }

            @SuppressWarnings("ObjectAllocationInLoop")
            @Override
            void testDocFreq() throws Exception {
                for (final String f : idx.flds) {
                    Assert.assertEquals("Missing term from all documents.", idx.docs,
                        fReader.docFreq(new Term(f, "value")));
                    Assert.assertEquals("Found hidden term.", 0L,
                        fReader.docFreq(new Term(f, "first")));
                }
            }

            @Override
            void testSumDocFreq() throws Exception {
                Assert.assertEquals("SumDocFreq mismatch for visible term.", 14L,
                    fReader.getSumDocFreq("f1"));
            }

            @Override
            void testTermVectors() throws Exception {
                final BytesRef term = new BytesRef("first");
                for (int i = 0; i < idx.docs - 1; i++) {
                    final Fields f = fReader.getTermVectors(i);
                    Assert.assertEquals("Too much fields retrieved from TermVector.", 1L,
                        f.size());
                    // The hidden term must not be reachable through term vectors either.
                    final TermsEnum te = f.terms("f1").iterator(null);
                    Assert.assertFalse("Hidden term found.", te.seekExact(term));
                }
            }

            @Override
            void testNumDocs() throws Exception {
                // Hiding a term must not hide any document.
                Assert.assertEquals("NumDocs mismatch.", idx.docs, fReader.numDocs());
            }

            @Override
            void testMaxDoc() throws Exception {
                Assert.assertEquals("MaxDoc mismatch.", idx.docs, fReader.maxDoc());
            }
        };
    }
}
From source file:de.unihildesheim.iw.lucene.index.FilteredDirectoryReaderTest.java
License:Open Source License
/**
 * Test {@link Filter} usage in combination with {@link TermFilter}
 * restriction: a query filter reduces the visible documents to one, while
 * the term filter additionally hides the term "document2field3".
 *
 * @throws Exception Any exception thrown indicates an error
 */
@SuppressWarnings({ "AnonymousInnerClassMayBeStatic", "ImplicitNumericConversion" })
@Test
public void testBuilder_filter_and_termFilter() throws Exception {
    try (TestMemIndex idx = new TestMemIndex(Index.ALL_FIELDS)) {
        final String skipTerm = "document2field3";
        // Query filter: only the document containing "document2field1" in f1
        // remains visible.
        final Query q = new TermQuery(new Term("f1", "document2field1"));
        final Filter f = new QueryWrapperFilter(q);
        final DirectoryReader reader = DirectoryReader.open(idx.dir);
        final FilteredDirectoryReader fReader = new Builder(reader).queryFilter(f)
            .termFilter(new TermFilter() {
                @Override
                public boolean isAccepted(@Nullable final TermsEnum termsEnum,
                    @NotNull final BytesRef term) {
                    return !skipTerm.equals(term.utf8ToString());
                }
            }).build();
        // NOTE(review): LeafReaderInstanceTest presumably invokes the overridden
        // test* methods from its constructor — confirm against its declaration.
        new LeafReaderInstanceTest() {
            @Override
            void testHasDeletions() throws Exception {
                Assert.assertFalse("Reader has deletions.", fReader.hasDeletions());
            }

            @Override
            void testFieldCount() throws Exception {
                Assert.assertEquals("Field count mismatch.", 3L, fReader.getFields().size());
            }

            @Override
            void testFieldNames() throws Exception {
                for (final String fld : idx.flds) {
                    Assert.assertTrue("Visible field not found.",
                        fReader.getFields().contains(fld));
                }
            }

            @Override
            void testTotalTermFreq() throws Exception {
                Assert.assertEquals("TotalTermFreq mismatch for visible term.", 1L,
                    fReader.totalTermFreq(new Term("f1", "field1")));
                Assert.assertEquals("TotalTermFreq mismatch for visible term.", 1L,
                    fReader.totalTermFreq(new Term("f2", "field2")));
                Assert.assertEquals("TotalTermFreq mismatch for visible term.", 1L,
                    fReader.totalTermFreq(new Term("f3", "field3")));
                // The term filter must hide "document2field3" entirely.
                Assert.assertEquals("TotalTermFreq mismatch for hidden term.", 0L,
                    fReader.totalTermFreq(new Term("f3", "document2field3")));
            }

            @Override
            void testSumTotalTermFreq() throws Exception {
                Assert.assertEquals("SumTotalTermFreq mismatch for visible terms.", 6L,
                    fReader.getSumTotalTermFreq("f2"));
            }

            @Override
            void testDocCount() throws Exception {
                // Only one document survives the query filter, in every field.
                for (final String fld : idx.flds) {
                    Assert.assertEquals("Doc count mismatch.", 1L, fReader.getDocCount(fld));
                }
            }

            @SuppressWarnings("ObjectAllocationInLoop")
            @Override
            void testDocFreq() throws Exception {
                Assert.assertEquals("Missing term from visible document.", 1L,
                    fReader.docFreq(new Term("f2", "value")));
                // Term from the filtered-out document must be invisible.
                Assert.assertEquals("Hidden term found.", 0L,
                    fReader.docFreq(new Term("f1", "document1field1")));
                // Term hidden by the TermFilter must be invisible.
                Assert.assertEquals("Hidden term found.", 0L,
                    fReader.docFreq(new Term("f3", "document2field3")));
            }

            @Override
            void testSumDocFreq() throws Exception {
                Assert.assertEquals("SumDocFreq mismatch for visible term.", 6L,
                    fReader.getSumDocFreq("f2"));
                Assert.assertEquals("SumDocFreq mismatch for visible term.", 5L,
                    fReader.getSumDocFreq("f3"));
            }

            @Override
            void testTermVectors() throws Exception {
                boolean match = false;
                final BytesRef term = new BytesRef(skipTerm);
                for (int i = 0; i < fReader.maxDoc(); i++) {
                    final Fields fld = fReader.getTermVectors(i);
                    // Term vectors are null for documents hidden by the query filter.
                    if (fld != null) {
                        match = true;
                        Assert.assertEquals(
                            "Number of fields retrieved from TermVector do not match.", 3L,
                            fld.size());
                        final Terms t = fld.terms("f3");
                        if (t != null) {
                            final TermsEnum te = t.iterator(null);
                            Assert.assertFalse("Hidden term found.", te.seekExact(term));
                        }
                    }
                }
                Assert.assertTrue("Fields not found.", match);
            }

            @Override
            void testNumDocs() throws Exception {
                // Query filter leaves exactly one live document.
                Assert.assertEquals("NumDocs mismatch.", 1L, fReader.numDocs());
            }

            @Override
            void testMaxDoc() throws Exception {
                // maxDoc still reflects the underlying index size.
                Assert.assertEquals("MaxDoc mismatch.", 2L, fReader.maxDoc());
            }
        };
    }
}
From source file:de.unihildesheim.iw.lucene.query.QueryUtils.java
License:Open Source License
/** * Remove terms from the given collection, if they are not found in the * collection./* w ww.ja v a 2 s. co m*/ * * @param dataProv IndexDataProvider * @param terms Collection of terms to check against the collection * @return Passed in terms with non-collection terms removed */ @SuppressFBWarnings("LO_APPENDED_STRING_IN_FORMAT_STRING") private static BytesRefArray removeUnknownTerms(@NotNull final IndexDataProvider dataProv, @NotNull final BytesRefArray terms) { final StringBuilder sb = new StringBuilder("Skipped terms (stopword or not in collection): ["); final FixedBitSet bits = new FixedBitSet(terms.size()); final BytesRefBuilder spare = new BytesRefBuilder(); BytesRef term; if (terms.size() == 0) { return terms; } else { for (int i = terms.size() - 1; i >= 0; i--) { term = terms.get(spare, i); if (dataProv.getTermFrequency(term) <= 0L) { sb.append(term.utf8ToString()).append(' '); bits.set(i); } } if (bits.cardinality() > 0) { LOG.warn(sb.toString().trim() + "]."); final BytesRefArray cleanTerms = new BytesRefArray(Counter.newCounter(false)); for (int i = terms.size() - 1; i >= 0; i--) { if (!bits.get(i)) { term = terms.get(spare, i); cleanTerms.append(term); // copies bytes } } return cleanTerms; } return terms; } }
From source file:de.unihildesheim.iw.lucene.query.QueryUtilsTest.java
License:Open Source License
@SuppressWarnings("ImplicitNumericConversion") @Test// w w w .ja va2 s. c om public void testTokenizeQuery_noMetrics() throws Exception { final BytesRefArray bra = QueryUtils.tokenizeQuery("foo bar baz", ANALYZER, null); Assert.assertEquals("Extracted terms count mismatch.", 3L, bra.size()); final BytesRefIterator braIt = bra.iterator(); BytesRef term; while ((term = braIt.next()) != null) { final String termStr = term.utf8ToString(); switch (termStr) { case "foo": case "bar": case "baz": break; default: Assert.fail("Unknown term found."); break; } } }
From source file:de.unihildesheim.iw.lucene.query.QueryUtilsTest.java
License:Open Source License
/** * Test tokenizing with skipping terms not present in index. * * @throws Exception//from w w w . j a v a2 s .c om */ @SuppressWarnings("ImplicitNumericConversion") @Test public void testTokenizeQuery() throws Exception { try (TestMemIndex idx = new TestMemIndex()) { final IndexDataProvider idp = idx.getIdp(); final BytesRefArray bra = QueryUtils.tokenizeQuery("foo bar field baz value", ANALYZER, idp); Assert.assertEquals("Extracted terms count mismatch.", 2L, bra.size()); final BytesRefIterator braIt = bra.iterator(); BytesRef term; while ((term = braIt.next()) != null) { final String termStr = term.utf8ToString(); switch (termStr) { case "foo": case "bar": case "baz": Assert.fail("Non-index term found."); break; case "value": case "field": // pass break; default: Assert.fail("Unknown term found."); break; } } } }
From source file:de.uni_koeln.spinfo.textengineering.tm.classification.lucene.LuceneAdapter.java
License:Open Source License
@Override public String classify(Document document) { try {//from w ww. j av a 2 s . c o m ClassificationResult<BytesRef> result = classifier.assignClass(document.getText()); BytesRef assignedClass = result.getAssignedClass(); // printAssignments(document, result);//optional return assignedClass.utf8ToString(); } catch (IOException e) { e.printStackTrace(); } return null; }