List of usage examples for org.apache.lucene.util.BytesRefArray#append
@Override public int append(BytesRef bytes)
From source file:de.unihildesheim.iw.lucene.analyzer.EnglishAnalyzerTest.java
License:Open Source License
/**
 * Checks that an {@link EnglishAnalyzer} created with its no-argument
 * constructor emits all four terms of the query string.
 *
 * @throws Exception Any exception thrown indicates a test failure
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream_noStopwords() throws Exception {
    final String query = "foo bar baz bam";
    final Analyzer analyzer = new EnglishAnalyzer();
    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));
    try (TokenStream stream = analyzer.tokenStream(null, query)) {
        stream.reset();
        // The stream updates one attribute instance per token, so a single
        // lookup after reset() is enough.
        final CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(termAttr);
            if (term.length > 0) {
                result.append(term);
            }
        }
        // Lucene's consumer workflow requires end() before the stream is closed.
        stream.end();
    }

    Assert.assertEquals("Not all terms returned.", 4L, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        // Decode once instead of once per comparison.
        final String text = term.utf8ToString();
        Assert.assertTrue("Unknown term found.",
                "foo".equals(text) || "bar".equals(text)
                        || "baz".equals(text) || "bam".equals(text));
    }
}
From source file:de.unihildesheim.iw.lucene.analyzer.EnglishAnalyzerTest.java
License:Open Source License
/**
 * Checks that an {@link EnglishAnalyzer} constructed with a
 * {@link CharArraySet} holding "foo" and "bar" emits only the two remaining
 * query terms.
 *
 * @throws Exception Any exception thrown indicates a test failure
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream() throws Exception {
    final CharArraySet csa = new CharArraySet(Arrays.asList("foo", "bar"), true);
    final String query = "foo bar baz bam";
    final Analyzer analyzer = new EnglishAnalyzer(csa);
    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));
    try (TokenStream stream = analyzer.tokenStream(null, query)) {
        stream.reset();
        // The stream updates one attribute instance per token, so a single
        // lookup after reset() is enough.
        final CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(termAttr);
            if (term.length > 0) {
                result.append(term);
            }
        }
        // Lucene's consumer workflow requires end() before the stream is closed.
        stream.end();
    }

    Assert.assertEquals("Not all terms returned.", 2L, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        // Decode once instead of once per comparison.
        final String text = term.utf8ToString();
        Assert.assertTrue("Unknown term found.",
                "baz".equals(text) || "bam".equals(text));
    }
}
From source file:de.unihildesheim.iw.lucene.analyzer.FrenchAnalyzerTest.java
License:Open Source License
/**
 * Checks that a {@link FrenchAnalyzer} created with its no-argument
 * constructor emits all four terms of the query string.
 *
 * @throws Exception Any exception thrown indicates a test failure
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream_noStopwords() throws Exception {
    final String query = "foo bar baz bam";
    final Analyzer analyzer = new FrenchAnalyzer();
    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));
    try (TokenStream stream = analyzer.tokenStream(null, query)) {
        stream.reset();
        // The stream updates one attribute instance per token, so a single
        // lookup after reset() is enough.
        final CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(termAttr);
            if (term.length > 0) {
                result.append(term);
            }
        }
        // Lucene's consumer workflow requires end() before the stream is closed.
        stream.end();
    }

    Assert.assertEquals("Not all terms returned.", 4L, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        // Decode once instead of once per comparison.
        final String text = term.utf8ToString();
        Assert.assertTrue("Unknown term found.",
                "foo".equals(text) || "bar".equals(text)
                        || "baz".equals(text) || "bam".equals(text));
    }
}
From source file:de.unihildesheim.iw.lucene.analyzer.FrenchAnalyzerTest.java
License:Open Source License
/**
 * Checks that a {@link FrenchAnalyzer} constructed with a
 * {@link CharArraySet} holding "foo" and "bar" emits only the two remaining
 * query terms.
 *
 * @throws Exception Any exception thrown indicates a test failure
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream() throws Exception {
    final CharArraySet csa = new CharArraySet(Arrays.asList("foo", "bar"), true);
    final String query = "foo bar baz bam";
    final Analyzer analyzer = new FrenchAnalyzer(csa);
    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));
    try (TokenStream stream = analyzer.tokenStream(null, query)) {
        stream.reset();
        // The stream updates one attribute instance per token, so a single
        // lookup after reset() is enough.
        final CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(termAttr);
            if (term.length > 0) {
                result.append(term);
            }
        }
        // Lucene's consumer workflow requires end() before the stream is closed.
        stream.end();
    }

    Assert.assertEquals("Not all terms returned.", 2L, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        // Decode once instead of once per comparison.
        final String text = term.utf8ToString();
        Assert.assertTrue("Unknown term found.",
                "baz".equals(text) || "bam".equals(text));
    }
}
From source file:de.unihildesheim.iw.lucene.analyzer.FrenchAnalyzerTest.java
License:Open Source License
/**
 * Checks elision handling of the {@link FrenchAnalyzer}: every entry of
 * {@code FrenchAnalyzer.DEFAULT_ELISIONS} is prepended to "bim" (joined by an
 * apostrophe) and added to the query. The test expects one "bim" term per
 * elision in addition to "baz" and "bam"; "foo" and "bar" are passed to the
 * analyzer in a {@link CharArraySet}.
 *
 * @throws Exception Any exception thrown indicates a test failure
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream_elisions() throws Exception {
    final CharArraySet csa = new CharArraySet(Arrays.asList("foo", "bar"), true);
    final StringBuilder query = new StringBuilder("foo bar baz bam ");
    // add all elisions to the query
    for (final String s : FrenchAnalyzer.DEFAULT_ELISIONS) {
        query.append(s).append("\'bim ");
    }
    final Analyzer analyzer = new FrenchAnalyzer(csa);
    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));
    try (TokenStream stream = analyzer.tokenStream(null, query.toString())) {
        stream.reset();
        // The stream updates one attribute instance per token, so a single
        // lookup after reset() is enough.
        final CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(termAttr);
            if (term.length > 0) {
                result.append(term);
            }
        }
        // Lucene's consumer workflow requires end() before the stream is closed.
        stream.end();
    }

    Assert.assertEquals("Not all terms returned.",
            2L + FrenchAnalyzer.DEFAULT_ELISIONS.length, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        // Decode once instead of once per comparison.
        final String text = term.utf8ToString();
        Assert.assertTrue("Unknown term found.",
                "baz".equals(text) || "bam".equals(text)
                        // elisions should be removed from this
                        || "bim".equals(text));
    }
}
From source file:de.unihildesheim.iw.lucene.analyzer.GermanAnalyzerTest.java
License:Open Source License
/**
 * Checks that a {@link GermanAnalyzer} created with its no-argument
 * constructor emits all four terms of the query string.
 *
 * @throws Exception Any exception thrown indicates a test failure
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream_noStopwords() throws Exception {
    final String query = "foo bar baz bam";
    final Analyzer analyzer = new GermanAnalyzer();
    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));
    try (TokenStream stream = analyzer.tokenStream(null, query)) {
        stream.reset();
        // The stream updates one attribute instance per token, so a single
        // lookup after reset() is enough.
        final CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(termAttr);
            if (term.length > 0) {
                result.append(term);
            }
        }
        // Lucene's consumer workflow requires end() before the stream is closed.
        stream.end();
    }

    Assert.assertEquals("Not all terms returned.", 4L, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        // Decode once instead of once per comparison.
        final String text = term.utf8ToString();
        Assert.assertTrue("Unknown term found.",
                "foo".equals(text) || "bar".equals(text)
                        || "baz".equals(text) || "bam".equals(text));
    }
}
From source file:de.unihildesheim.iw.lucene.analyzer.GermanAnalyzerTest.java
License:Open Source License
/**
 * Checks that a {@link GermanAnalyzer} constructed with a
 * {@link CharArraySet} holding "foo" and "bar" emits only the two remaining
 * query terms.
 *
 * @throws Exception Any exception thrown indicates a test failure
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream() throws Exception {
    final CharArraySet csa = new CharArraySet(Arrays.asList("foo", "bar"), true);
    final String query = "foo bar baz bam";
    final Analyzer analyzer = new GermanAnalyzer(csa);
    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));
    try (TokenStream stream = analyzer.tokenStream(null, query)) {
        stream.reset();
        // The stream updates one attribute instance per token, so a single
        // lookup after reset() is enough.
        final CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(termAttr);
            if (term.length > 0) {
                result.append(term);
            }
        }
        // Lucene's consumer workflow requires end() before the stream is closed.
        stream.end();
    }

    Assert.assertEquals("Not all terms returned.", 2L, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        // Decode once instead of once per comparison.
        final String text = term.utf8ToString();
        Assert.assertTrue("Unknown term found.",
                "baz".equals(text) || "bam".equals(text));
    }
}
From source file:de.unihildesheim.iw.lucene.query.QueryUtils.java
License:Open Source License
/** * Tokenizes a query string using Lucenes analyzer. This also removes * stopwords from the query string. The {@link IndexDataProvider} instance is * used to skip terms no found in the collection. * * @param query Query string to tokenize * @param qAnalyzer Analyzer to use/*w w w . j a v a 2 s . c o m*/ * @param dataProv IndexDataProvider * @return List of tokens from original query with stop-words removed */ @SuppressWarnings("ObjectAllocationInLoop") public static BytesRefArray tokenizeQuery(@NotNull final String query, @NotNull final Analyzer qAnalyzer, @Nullable final IndexDataProvider dataProv) { BytesRefArray result = new BytesRefArray(Counter.newCounter(false)); try (TokenStream stream = qAnalyzer.tokenStream(null, query)) { stream.reset(); while (stream.incrementToken()) { final BytesRef term = new BytesRef(stream.getAttribute(CharTermAttribute.class)); if (term.length > 0) { result.append(term); } } } catch (final IOException e) { // not thrown b/c we're using a string reader } if (dataProv != null) { result = removeUnknownTerms(dataProv, result); } return result; }
From source file:de.unihildesheim.iw.lucene.query.QueryUtils.java
License:Open Source License
/** * Remove terms from the given collection, if they are not found in the * collection.//from ww w . j av a 2 s .c o m * * @param dataProv IndexDataProvider * @param terms Collection of terms to check against the collection * @return Passed in terms with non-collection terms removed */ @SuppressFBWarnings("LO_APPENDED_STRING_IN_FORMAT_STRING") private static BytesRefArray removeUnknownTerms(@NotNull final IndexDataProvider dataProv, @NotNull final BytesRefArray terms) { final StringBuilder sb = new StringBuilder("Skipped terms (stopword or not in collection): ["); final FixedBitSet bits = new FixedBitSet(terms.size()); final BytesRefBuilder spare = new BytesRefBuilder(); BytesRef term; if (terms.size() == 0) { return terms; } else { for (int i = terms.size() - 1; i >= 0; i--) { term = terms.get(spare, i); if (dataProv.getTermFrequency(term) <= 0L) { sb.append(term.utf8ToString()).append(' '); bits.set(i); } } if (bits.cardinality() > 0) { LOG.warn(sb.toString().trim() + "]."); final BytesRefArray cleanTerms = new BytesRefArray(Counter.newCounter(false)); for (int i = terms.size() - 1; i >= 0; i--) { if (!bits.get(i)) { term = terms.get(spare, i); cleanTerms.append(term); // copies bytes } } return cleanTerms; } return terms; } }
From source file:de.unihildesheim.iw.lucene.scoring.clarity.DefaultClarityScoreTest.java
License:Open Source License
@SuppressWarnings("ImplicitNumericConversion") @Test/*from www .ja v a 2 s .c o m*/ public void testModelLowPrecision_constructor() throws Exception { try (TestMemIndex idx = new TestMemIndex()) { final IndexDataProvider idp = idx.getIdp(); // query final BytesRefArray qt = new BytesRefArray(Counter.newCounter(false)); qt.append(new BytesRef("document1")); qt.append(new BytesRef("value")); // feedback documents @SuppressWarnings("UnnecessarilyQualifiedInnerClassAccess") final DocIdSet fb = new RoaringDocIdSet.Builder(3).add(0).add(1).add(2).build(); final ModelLowPrecision lpMod = new ModelLowPrecision(idp, LANG_MODEL_WEIGHT, qt, fb); Assert.assertEquals("Number of cached document models differs.", idx.docs, lpMod.docModels.size()); Assert.assertEquals("Wrong number of feedback documents.", 3L, lpMod.feedbackDocs.length); } }