Example usage for org.apache.lucene.util BytesRefArray BytesRefArray

List of usage examples for org.apache.lucene.util BytesRefArray BytesRefArray

Introduction

This page collects usage examples for the BytesRefArray(Counter) constructor of org.apache.lucene.util.BytesRefArray.

Prototype

public BytesRefArray(Counter bytesUsed) 

Source Link

Document

Creates a new BytesRefArray with a counter to track allocated bytes.

Usage

From source file:de.unihildesheim.iw.lucene.analyzer.EnglishAnalyzerTest.java

License:Open Source License

/**
 * Runs the query "foo bar baz bam" through an {@code EnglishAnalyzer} created
 * with its no-argument constructor and expects four non-empty terms, each
 * being one of the four query words.
 *
 * @throws Exception Any exception thrown indicates an error
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream_noStopwords() throws Exception {
    final String query = "foo bar baz bam";
    final Analyzer analyzer = new EnglishAnalyzer();
    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));

    try (TokenStream stream = analyzer.tokenStream(null, query)) {
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(stream.getAttribute(CharTermAttribute.class));
            // empty terms are not collected
            if (term.length > 0) {
                result.append(term);
            }
        }
        // finish the TokenStream workflow: end() follows the last incrementToken()
        stream.end();
    }

    Assert.assertEquals("Not all terms returned.", 4L, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        // decode once, compare against every expected word
        final String text = term.utf8ToString();
        Assert.assertTrue("Unknown term found.",
                "foo".equals(text) || "bar".equals(text) || "baz".equals(text) || "bam".equals(text));
    }
}

From source file:de.unihildesheim.iw.lucene.analyzer.EnglishAnalyzerTest.java

License:Open Source License

/**
 * Runs the query "foo bar baz bam" through an {@code EnglishAnalyzer} that was
 * handed a case-insensitive {@code CharArraySet} containing "foo" and "bar",
 * and expects only the two terms "baz" and "bam" to be emitted.
 *
 * @throws Exception Any exception thrown indicates an error
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream() throws Exception {
    final CharArraySet csa = new CharArraySet(Arrays.asList("foo", "bar"), true);
    final String query = "foo bar baz bam";
    final Analyzer analyzer = new EnglishAnalyzer(csa);
    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));

    try (TokenStream stream = analyzer.tokenStream(null, query)) {
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(stream.getAttribute(CharTermAttribute.class));
            // empty terms are not collected
            if (term.length > 0) {
                result.append(term);
            }
        }
        // finish the TokenStream workflow: end() follows the last incrementToken()
        stream.end();
    }

    Assert.assertEquals("Not all terms returned.", 2L, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        // decode once, compare against every expected word
        final String text = term.utf8ToString();
        Assert.assertTrue("Unknown term found.", "baz".equals(text) || "bam".equals(text));
    }
}

From source file:de.unihildesheim.iw.lucene.analyzer.FrenchAnalyzerTest.java

License:Open Source License

/**
 * Runs the query "foo bar baz bam" through a {@code FrenchAnalyzer} created
 * with its no-argument constructor and expects four non-empty terms, each
 * being one of the four query words.
 *
 * @throws Exception Any exception thrown indicates an error
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream_noStopwords() throws Exception {
    final String query = "foo bar baz bam";
    final Analyzer analyzer = new FrenchAnalyzer();
    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));

    try (TokenStream stream = analyzer.tokenStream(null, query)) {
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(stream.getAttribute(CharTermAttribute.class));
            // empty terms are not collected
            if (term.length > 0) {
                result.append(term);
            }
        }
        // finish the TokenStream workflow: end() follows the last incrementToken()
        stream.end();
    }

    Assert.assertEquals("Not all terms returned.", 4L, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        // decode once, compare against every expected word
        final String text = term.utf8ToString();
        Assert.assertTrue("Unknown term found.",
                "foo".equals(text) || "bar".equals(text) || "baz".equals(text) || "bam".equals(text));
    }
}

From source file:de.unihildesheim.iw.lucene.analyzer.FrenchAnalyzerTest.java

License:Open Source License

/**
 * Runs the query "foo bar baz bam" through a {@code FrenchAnalyzer} that was
 * handed a case-insensitive {@code CharArraySet} containing "foo" and "bar",
 * and expects only the two terms "baz" and "bam" to be emitted.
 *
 * @throws Exception Any exception thrown indicates an error
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream() throws Exception {
    final CharArraySet csa = new CharArraySet(Arrays.asList("foo", "bar"), true);
    final String query = "foo bar baz bam";
    final Analyzer analyzer = new FrenchAnalyzer(csa);
    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));

    try (TokenStream stream = analyzer.tokenStream(null, query)) {
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(stream.getAttribute(CharTermAttribute.class));
            // empty terms are not collected
            if (term.length > 0) {
                result.append(term);
            }
        }
        // finish the TokenStream workflow: end() follows the last incrementToken()
        stream.end();
    }

    Assert.assertEquals("Not all terms returned.", 2L, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        // decode once, compare against every expected word
        final String text = term.utf8ToString();
        Assert.assertTrue("Unknown term found.", "baz".equals(text) || "bam".equals(text));
    }
}

From source file:de.unihildesheim.iw.lucene.analyzer.FrenchAnalyzerTest.java

License:Open Source License

/**
 * Builds a query from "foo bar baz bam" followed by one
 * {@code <elision>'bim} word for every entry of
 * {@code FrenchAnalyzer.DEFAULT_ELISIONS} and analyzes it with a
 * {@code FrenchAnalyzer} that was handed a case-insensitive
 * {@code CharArraySet} containing "foo" and "bar". The expected output is
 * "baz", "bam" and one "bim" per elision, i.e. each elision prefix has been
 * stripped from its word.
 *
 * @throws Exception Any exception thrown indicates an error
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream_elisions() throws Exception {
    final CharArraySet csa = new CharArraySet(Arrays.asList("foo", "bar"), true);
    final StringBuilder query = new StringBuilder("foo bar baz bam ");
    // add all elisions to the query
    for (final String s : FrenchAnalyzer.DEFAULT_ELISIONS) {
        query.append(s).append("\'bim ");
    }
    final Analyzer analyzer = new FrenchAnalyzer(csa);
    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));

    try (TokenStream stream = analyzer.tokenStream(null, query.toString())) {
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(stream.getAttribute(CharTermAttribute.class));
            // empty terms are not collected
            if (term.length > 0) {
                result.append(term);
            }
        }
        // finish the TokenStream workflow: end() follows the last incrementToken()
        stream.end();
    }

    // "baz" and "bam" plus one "bim" for every elision added above
    Assert.assertEquals("Not all terms returned.", 2L + FrenchAnalyzer.DEFAULT_ELISIONS.length, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        // decode once, compare against every expected word
        final String text = term.utf8ToString();
        Assert.assertTrue("Unknown term found.",
                "baz".equals(text) || "bam".equals(text) ||
                // elisions should be removed from this
                        "bim".equals(text));
    }
}

From source file:de.unihildesheim.iw.lucene.analyzer.GermanAnalyzerTest.java

License:Open Source License

/**
 * Runs the query "foo bar baz bam" through a {@code GermanAnalyzer} created
 * with its no-argument constructor and expects four non-empty terms, each
 * being one of the four query words.
 *
 * @throws Exception Any exception thrown indicates an error
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream_noStopwords() throws Exception {
    final String query = "foo bar baz bam";
    final Analyzer analyzer = new GermanAnalyzer();
    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));

    try (TokenStream stream = analyzer.tokenStream(null, query)) {
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(stream.getAttribute(CharTermAttribute.class));
            // empty terms are not collected
            if (term.length > 0) {
                result.append(term);
            }
        }
        // finish the TokenStream workflow: end() follows the last incrementToken()
        stream.end();
    }

    Assert.assertEquals("Not all terms returned.", 4L, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        // decode once, compare against every expected word
        final String text = term.utf8ToString();
        Assert.assertTrue("Unknown term found.",
                "foo".equals(text) || "bar".equals(text) || "baz".equals(text) || "bam".equals(text));
    }
}

From source file:de.unihildesheim.iw.lucene.analyzer.GermanAnalyzerTest.java

License:Open Source License

/**
 * Runs the query "foo bar baz bam" through a {@code GermanAnalyzer} that was
 * handed a case-insensitive {@code CharArraySet} containing "foo" and "bar",
 * and expects only the two terms "baz" and "bam" to be emitted.
 *
 * @throws Exception Any exception thrown indicates an error
 */
@SuppressWarnings({ "resource", "ObjectAllocationInLoop", "ImplicitNumericConversion" })
@Test
public void testTokenStream() throws Exception {
    final CharArraySet csa = new CharArraySet(Arrays.asList("foo", "bar"), true);
    final String query = "foo bar baz bam";
    final Analyzer analyzer = new GermanAnalyzer(csa);
    final BytesRefArray result = new BytesRefArray(Counter.newCounter(false));

    try (TokenStream stream = analyzer.tokenStream(null, query)) {
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(stream.getAttribute(CharTermAttribute.class));
            // empty terms are not collected
            if (term.length > 0) {
                result.append(term);
            }
        }
        // finish the TokenStream workflow: end() follows the last incrementToken()
        stream.end();
    }

    Assert.assertEquals("Not all terms returned.", 2L, result.size());

    final BytesRefIterator bri = result.iterator();
    BytesRef term;
    while ((term = bri.next()) != null) {
        // decode once, compare against every expected word
        final String text = term.utf8ToString();
        Assert.assertTrue("Unknown term found.", "baz".equals(text) || "bam".equals(text));
    }
}

From source file:de.unihildesheim.iw.lucene.query.QueryUtils.java

License:Open Source License

/**
 * Tokenizes a query string using Lucenes analyzer. This also removes
 * stopwords from the query string. The {@link IndexDataProvider} instance is
 * used to skip terms no found in the collection.
 *
 * @param query Query string to tokenize
 * @param qAnalyzer Analyzer to use/*from   w w  w.  j av  a  2s.  c om*/
 * @param dataProv IndexDataProvider
 * @return List of tokens from original query with stop-words removed
 */
@SuppressWarnings("ObjectAllocationInLoop")
public static BytesRefArray tokenizeQuery(@NotNull final String query, @NotNull final Analyzer qAnalyzer,
        @Nullable final IndexDataProvider dataProv) {
    BytesRefArray result = new BytesRefArray(Counter.newCounter(false));

    try (TokenStream stream = qAnalyzer.tokenStream(null, query)) {
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(stream.getAttribute(CharTermAttribute.class));
            if (term.length > 0) {
                result.append(term);
            }
        }
    } catch (final IOException e) {
        // not thrown b/c we're using a string reader
    }
    if (dataProv != null) {
        result = removeUnknownTerms(dataProv, result);
    }
    return result;
}

From source file:de.unihildesheim.iw.lucene.query.QueryUtils.java

License:Open Source License

/**
 * Remove terms from the given collection, if they are not found in the
 * collection./*  w w  w  .  j a  v  a2s .  com*/
 *
 * @param dataProv IndexDataProvider
 * @param terms Collection of terms to check against the collection
 * @return Passed in terms with non-collection terms removed
 */
@SuppressFBWarnings("LO_APPENDED_STRING_IN_FORMAT_STRING")
private static BytesRefArray removeUnknownTerms(@NotNull final IndexDataProvider dataProv,
        @NotNull final BytesRefArray terms) {
    final StringBuilder sb = new StringBuilder("Skipped terms (stopword or not in collection): [");
    final FixedBitSet bits = new FixedBitSet(terms.size());
    final BytesRefBuilder spare = new BytesRefBuilder();
    BytesRef term;

    if (terms.size() == 0) {
        return terms;
    } else {
        for (int i = terms.size() - 1; i >= 0; i--) {
            term = terms.get(spare, i);
            if (dataProv.getTermFrequency(term) <= 0L) {
                sb.append(term.utf8ToString()).append(' ');
                bits.set(i);
            }
        }

        if (bits.cardinality() > 0) {
            LOG.warn(sb.toString().trim() + "].");
            final BytesRefArray cleanTerms = new BytesRefArray(Counter.newCounter(false));
            for (int i = terms.size() - 1; i >= 0; i--) {
                if (!bits.get(i)) {
                    term = terms.get(spare, i);
                    cleanTerms.append(term); // copies bytes
                }
            }
            return cleanTerms;
        }
        return terms;
    }
}

From source file:de.unihildesheim.iw.lucene.scoring.clarity.ClarityScoreResult.java

License:Open Source License

/**
 * Set the query terms used. The terms are appended, in iteration order, to a
 * newly created {@link BytesRefArray}, which stores its own copy of each
 * term's bytes. The field is only replaced after all terms were appended
 * successfully.
 *
 * @param qTerms Query terms
 */
final void setQueryTerms(@NotNull final Collection<BytesRef> qTerms) {
    // fill a local array first, so a failure while appending does not leave
    // a half-populated array behind in the field
    final BytesRefArray copiedTerms = new BytesRefArray(Counter.newCounter(false));
    for (final BytesRef qTerm : qTerms) {
        copiedTerms.append(qTerm);
    }
    this.queryTerms = copiedTerms;
}