List of usage examples for org.apache.lucene.util.BytesRefArray#size()
@Override public int size()
From source file:de.unihildesheim.iw.lucene.analyzer.EnglishAnalyzerTest.java
License:Open Source License
/**
 * Checks that an {@code EnglishAnalyzer} created without a stopword set
 * yields exactly four terms for the query {@code "foo bar baz bam"} and that
 * every returned term is one of the four query words.
 *
 * @throws Exception if the token stream cannot be consumed
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream_noStopwords() throws Exception {
  final String query = "foo bar baz bam";
  final Analyzer analyzer = new EnglishAnalyzer();
  final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));
  try (TokenStream stream = analyzer.tokenStream(null, query)) {
    // The attribute instance is shared by all tokens of a stream, so it is
    // looked up once instead of on every iteration.
    final CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      final BytesRef term = new BytesRef(termAttr);
      if (term.length > 0) {
        result.append(term);
      }
    }
    // Finish the consumer workflow: reset -> incrementToken* -> end -> close.
    stream.end();
  }

  Assert.assertEquals("Not all terms returned.", 4L, result.size());

  final BytesRefIterator bri = result.iterator();
  BytesRef term;
  while ((term = bri.next()) != null) {
    // Decode once and compare against every expected word.
    final String text = term.utf8ToString();
    Assert.assertTrue("Unknown term found.",
        "foo".equals(text) || "bar".equals(text) || "baz".equals(text) || "bam".equals(text));
  }
}
From source file:de.unihildesheim.iw.lucene.analyzer.EnglishAnalyzerTest.java
License:Open Source License
/**
 * Checks that an {@code EnglishAnalyzer} configured with the stopwords
 * {@code "foo"} and {@code "bar"} yields exactly two terms for the query
 * {@code "foo bar baz bam"} and that each of them is {@code "baz"} or
 * {@code "bam"}.
 *
 * @throws Exception if the token stream cannot be consumed
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream() throws Exception {
  final CharArraySet csa = new CharArraySet(Arrays.asList("foo", "bar"), true);
  final String query = "foo bar baz bam";
  final Analyzer analyzer = new EnglishAnalyzer(csa);
  final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));
  try (TokenStream stream = analyzer.tokenStream(null, query)) {
    // The attribute instance is shared by all tokens of a stream, so it is
    // looked up once instead of on every iteration.
    final CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      final BytesRef term = new BytesRef(termAttr);
      if (term.length > 0) {
        result.append(term);
      }
    }
    // Finish the consumer workflow: reset -> incrementToken* -> end -> close.
    stream.end();
  }

  Assert.assertEquals("Not all terms returned.", 2L, result.size());

  final BytesRefIterator bri = result.iterator();
  BytesRef term;
  while ((term = bri.next()) != null) {
    // Decode once and compare against every expected word.
    final String text = term.utf8ToString();
    Assert.assertTrue("Unknown term found.", "baz".equals(text) || "bam".equals(text));
  }
}
From source file:de.unihildesheim.iw.lucene.analyzer.FrenchAnalyzerTest.java
License:Open Source License
/**
 * Checks that a {@code FrenchAnalyzer} created without a stopword set yields
 * exactly four terms for the query {@code "foo bar baz bam"} and that every
 * returned term is one of the four query words.
 *
 * @throws Exception if the token stream cannot be consumed
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream_noStopwords() throws Exception {
  final String query = "foo bar baz bam";
  final Analyzer analyzer = new FrenchAnalyzer();
  final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));
  try (TokenStream stream = analyzer.tokenStream(null, query)) {
    // The attribute instance is shared by all tokens of a stream, so it is
    // looked up once instead of on every iteration.
    final CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      final BytesRef term = new BytesRef(termAttr);
      if (term.length > 0) {
        result.append(term);
      }
    }
    // Finish the consumer workflow: reset -> incrementToken* -> end -> close.
    stream.end();
  }

  Assert.assertEquals("Not all terms returned.", 4L, result.size());

  final BytesRefIterator bri = result.iterator();
  BytesRef term;
  while ((term = bri.next()) != null) {
    // Decode once and compare against every expected word.
    final String text = term.utf8ToString();
    Assert.assertTrue("Unknown term found.",
        "foo".equals(text) || "bar".equals(text) || "baz".equals(text) || "bam".equals(text));
  }
}
From source file:de.unihildesheim.iw.lucene.analyzer.FrenchAnalyzerTest.java
License:Open Source License
/**
 * Checks that a {@code FrenchAnalyzer} configured with the stopwords
 * {@code "foo"} and {@code "bar"} yields exactly two terms for the query
 * {@code "foo bar baz bam"} and that each of them is {@code "baz"} or
 * {@code "bam"}.
 *
 * @throws Exception if the token stream cannot be consumed
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream() throws Exception {
  final CharArraySet csa = new CharArraySet(Arrays.asList("foo", "bar"), true);
  final String query = "foo bar baz bam";
  final Analyzer analyzer = new FrenchAnalyzer(csa);
  final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));
  try (TokenStream stream = analyzer.tokenStream(null, query)) {
    // The attribute instance is shared by all tokens of a stream, so it is
    // looked up once instead of on every iteration.
    final CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      final BytesRef term = new BytesRef(termAttr);
      if (term.length > 0) {
        result.append(term);
      }
    }
    // Finish the consumer workflow: reset -> incrementToken* -> end -> close.
    stream.end();
  }

  Assert.assertEquals("Not all terms returned.", 2L, result.size());

  final BytesRefIterator bri = result.iterator();
  BytesRef term;
  while ((term = bri.next()) != null) {
    // Decode once and compare against every expected word.
    final String text = term.utf8ToString();
    Assert.assertTrue("Unknown term found.", "baz".equals(text) || "bam".equals(text));
  }
}
From source file:de.unihildesheim.iw.lucene.analyzer.FrenchAnalyzerTest.java
License:Open Source License
/**
 * Checks elision handling of a {@code FrenchAnalyzer} configured with the
 * stopwords {@code "foo"} and {@code "bar"}. The query consists of
 * {@code "foo bar baz bam"} followed by one {@code <elision>'bim} word for
 * every entry of {@code FrenchAnalyzer.DEFAULT_ELISIONS}. The test expects
 * two plain terms plus one term per elision entry, and every returned term
 * must be {@code "baz"}, {@code "bam"} or {@code "bim"} (i.e. the elision
 * prefix has been stripped).
 *
 * @throws Exception if the token stream cannot be consumed
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream_elisions() throws Exception {
  final CharArraySet csa = new CharArraySet(Arrays.asList("foo", "bar"), true);
  final StringBuilder query = new StringBuilder("foo bar baz bam ");
  // add all elisions to the query
  for (final String s : FrenchAnalyzer.DEFAULT_ELISIONS) {
    query.append(s).append("\'bim ");
  }
  final Analyzer analyzer = new FrenchAnalyzer(csa);
  final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));
  try (TokenStream stream = analyzer.tokenStream(null, query.toString())) {
    // The attribute instance is shared by all tokens of a stream, so it is
    // looked up once instead of on every iteration.
    final CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      final BytesRef term = new BytesRef(termAttr);
      if (term.length > 0) {
        result.append(term);
      }
    }
    // Finish the consumer workflow: reset -> incrementToken* -> end -> close.
    stream.end();
  }

  Assert.assertEquals("Not all terms returned.",
      2L + FrenchAnalyzer.DEFAULT_ELISIONS.length, result.size());

  final BytesRefIterator bri = result.iterator();
  BytesRef term;
  while ((term = bri.next()) != null) {
    // Decode once and compare against every expected word.
    final String text = term.utf8ToString();
    Assert.assertTrue("Unknown term found.",
        "baz".equals(text) || "bam".equals(text)
            // elisions should be removed from this
            || "bim".equals(text));
  }
}
From source file:de.unihildesheim.iw.lucene.analyzer.GermanAnalyzerTest.java
License:Open Source License
/**
 * Checks that a {@code GermanAnalyzer} created without a stopword set yields
 * exactly four terms for the query {@code "foo bar baz bam"} and that every
 * returned term is one of the four query words.
 *
 * @throws Exception if the token stream cannot be consumed
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream_noStopwords() throws Exception {
  final String query = "foo bar baz bam";
  final Analyzer analyzer = new GermanAnalyzer();
  final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));
  try (TokenStream stream = analyzer.tokenStream(null, query)) {
    // The attribute instance is shared by all tokens of a stream, so it is
    // looked up once instead of on every iteration.
    final CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      final BytesRef term = new BytesRef(termAttr);
      if (term.length > 0) {
        result.append(term);
      }
    }
    // Finish the consumer workflow: reset -> incrementToken* -> end -> close.
    stream.end();
  }

  Assert.assertEquals("Not all terms returned.", 4L, result.size());

  final BytesRefIterator bri = result.iterator();
  BytesRef term;
  while ((term = bri.next()) != null) {
    // Decode once and compare against every expected word.
    final String text = term.utf8ToString();
    Assert.assertTrue("Unknown term found.",
        "foo".equals(text) || "bar".equals(text) || "baz".equals(text) || "bam".equals(text));
  }
}
From source file:de.unihildesheim.iw.lucene.analyzer.GermanAnalyzerTest.java
License:Open Source License
/**
 * Checks that a {@code GermanAnalyzer} configured with the stopwords
 * {@code "foo"} and {@code "bar"} yields exactly two terms for the query
 * {@code "foo bar baz bam"} and that each of them is {@code "baz"} or
 * {@code "bam"}.
 *
 * @throws Exception if the token stream cannot be consumed
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream() throws Exception {
  final CharArraySet csa = new CharArraySet(Arrays.asList("foo", "bar"), true);
  final String query = "foo bar baz bam";
  final Analyzer analyzer = new GermanAnalyzer(csa);
  final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));
  try (TokenStream stream = analyzer.tokenStream(null, query)) {
    // The attribute instance is shared by all tokens of a stream, so it is
    // looked up once instead of on every iteration.
    final CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
      final BytesRef term = new BytesRef(termAttr);
      if (term.length > 0) {
        result.append(term);
      }
    }
    // Finish the consumer workflow: reset -> incrementToken* -> end -> close.
    stream.end();
  }

  Assert.assertEquals("Not all terms returned.", 2L, result.size());

  final BytesRefIterator bri = result.iterator();
  BytesRef term;
  while ((term = bri.next()) != null) {
    // Decode once and compare against every expected word.
    final String text = term.utf8ToString();
    Assert.assertTrue("Unknown term found.", "baz".equals(text) || "bam".equals(text));
  }
}
From source file:de.unihildesheim.iw.lucene.query.QueryUtils.java
License:Open Source License
/** * Remove terms from the given collection, if they are not found in the * collection./*from www . j ava2s . com*/ * * @param dataProv IndexDataProvider * @param terms Collection of terms to check against the collection * @return Passed in terms with non-collection terms removed */ @SuppressFBWarnings("LO_APPENDED_STRING_IN_FORMAT_STRING") private static BytesRefArray removeUnknownTerms(@NotNull final IndexDataProvider dataProv, @NotNull final BytesRefArray terms) { final StringBuilder sb = new StringBuilder("Skipped terms (stopword or not in collection): ["); final FixedBitSet bits = new FixedBitSet(terms.size()); final BytesRefBuilder spare = new BytesRefBuilder(); BytesRef term; if (terms.size() == 0) { return terms; } else { for (int i = terms.size() - 1; i >= 0; i--) { term = terms.get(spare, i); if (dataProv.getTermFrequency(term) <= 0L) { sb.append(term.utf8ToString()).append(' '); bits.set(i); } } if (bits.cardinality() > 0) { LOG.warn(sb.toString().trim() + "]."); final BytesRefArray cleanTerms = new BytesRefArray(Counter.newCounter(false)); for (int i = terms.size() - 1; i >= 0; i--) { if (!bits.get(i)) { term = terms.get(spare, i); cleanTerms.append(term); // copies bytes } } return cleanTerms; } return terms; } }
From source file:de.unihildesheim.iw.lucene.query.QueryUtils.java
License:Open Source License
/**
 * Tokenizes a query string using Lucene's analyzer and returns the tokens as
 * strings. The work is delegated to
 * {@link #tokenizeQuery(String, Analyzer, IndexDataProvider)}; each resulting
 * term is then decoded from UTF-8. Stopword removal and the skipping of
 * terms not found in the collection (via the {@link IndexDataProvider}) are
 * expected to happen in that delegate.
 *
 * @param query Query string to tokenize
 * @param qAnalyzer Analyzer to use
 * @param dataProv IndexDataProvider, may be {@code null}
 * @return List of tokens from original query with stop-words removed
 * @see #tokenizeQuery(String, Analyzer, IndexDataProvider)
 */
public static List<String> tokenizeQueryString(@NotNull final String query,
    @NotNull final Analyzer qAnalyzer, @Nullable final IndexDataProvider dataProv) {
  final BytesRefArray tokens = tokenizeQuery(query, qAnalyzer, dataProv);
  final int expectedSize = tokens.size();
  // Collect straight into a presized, mutable ArrayList.
  return StreamUtils.stream(tokens)
      .map(BytesRef::utf8ToString)
      .collect(Collectors.toCollection(() -> new ArrayList<String>(expectedSize)));
}
From source file:de.unihildesheim.iw.lucene.query.QueryUtilsTest.java
License:Open Source License
@SuppressWarnings("ImplicitNumericConversion") @Test//ww w .j av a 2s .co m public void testTokenizeQuery_noMetrics() throws Exception { final BytesRefArray bra = QueryUtils.tokenizeQuery("foo bar baz", ANALYZER, null); Assert.assertEquals("Extracted terms count mismatch.", 3L, bra.size()); final BytesRefIterator braIt = bra.iterator(); BytesRef term; while ((term = braIt.next()) != null) { final String termStr = term.utf8ToString(); switch (termStr) { case "foo": case "bar": case "baz": break; default: Assert.fail("Unknown term found."); break; } } }