List of usage examples for the org.apache.lucene.util.BytesRefArray constructor BytesRefArray(Counter)
public BytesRefArray(Counter bytesUsed)
From source file:de.unihildesheim.iw.lucene.analyzer.EnglishAnalyzerTest.java
License:Open Source License
/**
 * Checks that an {@link EnglishAnalyzer} created with its no-argument
 * constructor returns all four terms of the query {@code "foo bar baz bam"}.
 *
 * @throws Exception Any exception thrown indicates a test failure
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream_noStopwords() throws Exception {
    final String query = "foo bar baz bam";
    final Analyzer analyzer = new EnglishAnalyzer();
    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));

    try (TokenStream stream = analyzer.tokenStream(null, query)) {
        // Look the attribute up once instead of once per token.
        final CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(termAttr);
            if (term.length > 0) {
                result.append(term);
            }
        }
        // Complete the TokenStream consumer workflow before the stream is closed.
        stream.end();
    }

    Assert.assertEquals("Not all terms returned.", 4L, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        final String text = term.utf8ToString();
        Assert.assertTrue("Unknown term found.",
                "foo".equals(text) || "bar".equals(text)
                        || "baz".equals(text) || "bam".equals(text));
    }
}
From source file:de.unihildesheim.iw.lucene.analyzer.EnglishAnalyzerTest.java
License:Open Source License
/**
 * Checks that an {@link EnglishAnalyzer} created with the set {"foo", "bar"}
 * returns only the two remaining terms of the query {@code "foo bar baz bam"}.
 *
 * @throws Exception Any exception thrown indicates a test failure
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream() throws Exception {
    final CharArraySet csa = new CharArraySet(Arrays.asList("foo", "bar"), true);
    final String query = "foo bar baz bam";
    final Analyzer analyzer = new EnglishAnalyzer(csa);
    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));

    try (TokenStream stream = analyzer.tokenStream(null, query)) {
        // Look the attribute up once instead of once per token.
        final CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(termAttr);
            if (term.length > 0) {
                result.append(term);
            }
        }
        // Complete the TokenStream consumer workflow before the stream is closed.
        stream.end();
    }

    Assert.assertEquals("Not all terms returned.", 2L, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        final String text = term.utf8ToString();
        Assert.assertTrue("Unknown term found.",
                "baz".equals(text) || "bam".equals(text));
    }
}
From source file:de.unihildesheim.iw.lucene.analyzer.FrenchAnalyzerTest.java
License:Open Source License
/**
 * Checks that a {@link FrenchAnalyzer} created with its no-argument
 * constructor returns all four terms of the query {@code "foo bar baz bam"}.
 *
 * @throws Exception Any exception thrown indicates a test failure
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream_noStopwords() throws Exception {
    final String query = "foo bar baz bam";
    final Analyzer analyzer = new FrenchAnalyzer();
    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));

    try (TokenStream stream = analyzer.tokenStream(null, query)) {
        // Look the attribute up once instead of once per token.
        final CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(termAttr);
            if (term.length > 0) {
                result.append(term);
            }
        }
        // Complete the TokenStream consumer workflow before the stream is closed.
        stream.end();
    }

    Assert.assertEquals("Not all terms returned.", 4L, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        final String text = term.utf8ToString();
        Assert.assertTrue("Unknown term found.",
                "foo".equals(text) || "bar".equals(text)
                        || "baz".equals(text) || "bam".equals(text));
    }
}
From source file:de.unihildesheim.iw.lucene.analyzer.FrenchAnalyzerTest.java
License:Open Source License
/**
 * Checks that a {@link FrenchAnalyzer} created with the set {"foo", "bar"}
 * returns only the two remaining terms of the query {@code "foo bar baz bam"}.
 *
 * @throws Exception Any exception thrown indicates a test failure
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream() throws Exception {
    final CharArraySet csa = new CharArraySet(Arrays.asList("foo", "bar"), true);
    final String query = "foo bar baz bam";
    final Analyzer analyzer = new FrenchAnalyzer(csa);
    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));

    try (TokenStream stream = analyzer.tokenStream(null, query)) {
        // Look the attribute up once instead of once per token.
        final CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(termAttr);
            if (term.length > 0) {
                result.append(term);
            }
        }
        // Complete the TokenStream consumer workflow before the stream is closed.
        stream.end();
    }

    Assert.assertEquals("Not all terms returned.", 2L, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        final String text = term.utf8ToString();
        Assert.assertTrue("Unknown term found.",
                "baz".equals(text) || "bam".equals(text));
    }
}
From source file:de.unihildesheim.iw.lucene.analyzer.FrenchAnalyzerTest.java
License:Open Source License
/**
 * Checks elision handling of a {@link FrenchAnalyzer} created with the set
 * {"foo", "bar"}. The query consists of "foo bar baz bam" followed by one
 * {@code <elision>'bim} token for every entry of
 * {@link FrenchAnalyzer#DEFAULT_ELISIONS}. Expected are "baz", "bam" and one
 * "bim" per elision entry.
 *
 * @throws Exception Any exception thrown indicates a test failure
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream_elisions() throws Exception {
    final CharArraySet csa = new CharArraySet(Arrays.asList("foo", "bar"), true);
    final StringBuilder query = new StringBuilder("foo bar baz bam ");
    // add all elisions to the query
    for (final String s : FrenchAnalyzer.DEFAULT_ELISIONS) {
        query.append(s).append("\'bim ");
    }
    final Analyzer analyzer = new FrenchAnalyzer(csa);
    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));

    try (TokenStream stream = analyzer.tokenStream(null, query.toString())) {
        // Look the attribute up once instead of once per token.
        final CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(termAttr);
            if (term.length > 0) {
                result.append(term);
            }
        }
        // Complete the TokenStream consumer workflow before the stream is closed.
        stream.end();
    }

    Assert.assertEquals("Not all terms returned.",
            2L + FrenchAnalyzer.DEFAULT_ELISIONS.length, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        final String text = term.utf8ToString();
        Assert.assertTrue("Unknown term found.",
                "baz".equals(text) || "bam".equals(text)
                        // elisions should be removed from this
                        || "bim".equals(text));
    }
}
From source file:de.unihildesheim.iw.lucene.analyzer.GermanAnalyzerTest.java
License:Open Source License
/**
 * Checks that a {@link GermanAnalyzer} created with its no-argument
 * constructor returns all four terms of the query {@code "foo bar baz bam"}.
 *
 * @throws Exception Any exception thrown indicates a test failure
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream_noStopwords() throws Exception {
    final String query = "foo bar baz bam";
    final Analyzer analyzer = new GermanAnalyzer();
    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));

    try (TokenStream stream = analyzer.tokenStream(null, query)) {
        // Look the attribute up once instead of once per token.
        final CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(termAttr);
            if (term.length > 0) {
                result.append(term);
            }
        }
        // Complete the TokenStream consumer workflow before the stream is closed.
        stream.end();
    }

    Assert.assertEquals("Not all terms returned.", 4L, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        final String text = term.utf8ToString();
        Assert.assertTrue("Unknown term found.",
                "foo".equals(text) || "bar".equals(text)
                        || "baz".equals(text) || "bam".equals(text));
    }
}
From source file:de.unihildesheim.iw.lucene.analyzer.GermanAnalyzerTest.java
License:Open Source License
/**
 * Checks that a {@link GermanAnalyzer} created with the set {"foo", "bar"}
 * returns only the two remaining terms of the query {@code "foo bar baz bam"}.
 *
 * @throws Exception Any exception thrown indicates a test failure
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream() throws Exception {
    final CharArraySet csa = new CharArraySet(Arrays.asList("foo", "bar"), true);
    final String query = "foo bar baz bam";
    final Analyzer analyzer = new GermanAnalyzer(csa);
    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));

    try (TokenStream stream = analyzer.tokenStream(null, query)) {
        // Look the attribute up once instead of once per token.
        final CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(termAttr);
            if (term.length > 0) {
                result.append(term);
            }
        }
        // Complete the TokenStream consumer workflow before the stream is closed.
        stream.end();
    }

    Assert.assertEquals("Not all terms returned.", 2L, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        final String text = term.utf8ToString();
        Assert.assertTrue("Unknown term found.",
                "baz".equals(text) || "bam".equals(text));
    }
}
From source file:de.unihildesheim.iw.lucene.query.QueryUtils.java
License:Open Source License
/** * Tokenizes a query string using Lucenes analyzer. This also removes * stopwords from the query string. The {@link IndexDataProvider} instance is * used to skip terms no found in the collection. * * @param query Query string to tokenize * @param qAnalyzer Analyzer to use/*from w w w. j av a 2s. c om*/ * @param dataProv IndexDataProvider * @return List of tokens from original query with stop-words removed */ @SuppressWarnings("ObjectAllocationInLoop") public static BytesRefArray tokenizeQuery(@NotNull final String query, @NotNull final Analyzer qAnalyzer, @Nullable final IndexDataProvider dataProv) { BytesRefArray result = new BytesRefArray(Counter.newCounter(false)); try (TokenStream stream = qAnalyzer.tokenStream(null, query)) { stream.reset(); while (stream.incrementToken()) { final BytesRef term = new BytesRef(stream.getAttribute(CharTermAttribute.class)); if (term.length > 0) { result.append(term); } } } catch (final IOException e) { // not thrown b/c we're using a string reader } if (dataProv != null) { result = removeUnknownTerms(dataProv, result); } return result; }
From source file:de.unihildesheim.iw.lucene.query.QueryUtils.java
License:Open Source License
/** * Remove terms from the given collection, if they are not found in the * collection./* w w w . j a v a2s . com*/ * * @param dataProv IndexDataProvider * @param terms Collection of terms to check against the collection * @return Passed in terms with non-collection terms removed */ @SuppressFBWarnings("LO_APPENDED_STRING_IN_FORMAT_STRING") private static BytesRefArray removeUnknownTerms(@NotNull final IndexDataProvider dataProv, @NotNull final BytesRefArray terms) { final StringBuilder sb = new StringBuilder("Skipped terms (stopword or not in collection): ["); final FixedBitSet bits = new FixedBitSet(terms.size()); final BytesRefBuilder spare = new BytesRefBuilder(); BytesRef term; if (terms.size() == 0) { return terms; } else { for (int i = terms.size() - 1; i >= 0; i--) { term = terms.get(spare, i); if (dataProv.getTermFrequency(term) <= 0L) { sb.append(term.utf8ToString()).append(' '); bits.set(i); } } if (bits.cardinality() > 0) { LOG.warn(sb.toString().trim() + "]."); final BytesRefArray cleanTerms = new BytesRefArray(Counter.newCounter(false)); for (int i = terms.size() - 1; i >= 0; i--) { if (!bits.get(i)) { term = terms.get(spare, i); cleanTerms.append(term); // copies bytes } } return cleanTerms; } return terms; } }
From source file:de.unihildesheim.iw.lucene.scoring.clarity.ClarityScoreResult.java
License:Open Source License
/**
 * Set the query terms used. The terms are appended to a newly created
 * {@link BytesRefArray}, which then replaces any previously stored query terms.
 *
 * @param qTerms Query terms
 */
final void setQueryTerms(@NotNull final Collection<BytesRef> qTerms) {
    // Fill a local array first so the field never points to a half-populated
    // array if appending fails part-way through.
    final BytesRefArray collected = new BytesRefArray(Counter.newCounter(false));
    qTerms.forEach(collected::append);
    this.queryTerms = collected;
}