Example usage for org.apache.lucene.util BytesRef BytesRef

List of usage examples for org.apache.lucene.util BytesRef BytesRef

Introduction

On this page you can find example usages of the BytesRef constructor of org.apache.lucene.util.BytesRef.

Prototype

public BytesRef(CharSequence text) 

Source Link

Document

Initializes the byte[] with the UTF-8 encoding of the provided text (any CharSequence, e.g. a String).

Usage

From source file:de.unihildesheim.iw.lucene.query.QueryUtils.java

License:Open Source License

/**
 * Tokenizes a query string using the given Lucene analyzer. Any filtering
 * the analyzer performs (e.g. stop-word removal, if it is configured to do
 * so) is reflected in the result. Tokens that encode to zero bytes are
 * dropped. If an {@link IndexDataProvider} is given, the collected terms are
 * additionally passed through {@code removeUnknownTerms} to skip terms not
 * found in the collection.
 *
 * @param query Query string to tokenize
 * @param qAnalyzer Analyzer to use
 * @param dataProv IndexDataProvider used to filter the tokens, or
 * {@code null} to skip that filtering step
 * @return List of tokens from the original query as emitted by the analyzer
 */
@SuppressWarnings("ObjectAllocationInLoop")
public static BytesRefArray tokenizeQuery(@NotNull final String query, @NotNull final Analyzer qAnalyzer,
        @Nullable final IndexDataProvider dataProv) {
    BytesRefArray result = new BytesRefArray(Counter.newCounter(false));

    try (TokenStream stream = qAnalyzer.tokenStream(null, query)) {
        // Look the attribute up once instead of once per token.
        final CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(termAttr);
            if (term.length > 0) {
                result.append(term);
            }
        }
        // Lucene's consumer workflow requires end() after the last token
        // and before the stream gets closed.
        stream.end();
    } catch (final IOException ignored) {
        // not thrown b/c we're using a string reader
    }
    if (dataProv != null) {
        result = removeUnknownTerms(dataProv, result);
    }
    return result;
}

From source file:de.unihildesheim.iw.lucene.query.QueryUtils.java

License:Open Source License

/**
 * Tokenizes a query string using Lucenes analyzer. This also removes
 * stopwords from the query string. Returns a mapping of query-term to
 * in-query-frequency. The {@link IndexDataProvider} instance is used to skip
 * terms no found in the collection./*from   ww  w. j  av a 2s  . co m*/
 *
 * @param query Query String
 * @param qAnalyzer Analyzer used to parse the query String
 * @param dataProv IndexDataProvider
 * @return mapping of query-term to in-query-frequency with optionally terms
 * not in the collection skipped
 */
@SuppressWarnings("ObjectAllocationInLoop")
public static Map<BytesRef, Integer> tokenizeAndMapQuery(@NotNull final String query,
        @NotNull final Analyzer qAnalyzer, @Nullable final IndexDataProvider dataProv) {
    // estimate size
    final Map<BytesRef, Integer> result = new HashMap<>(
            (int) ((double) StringUtils.estimatedWordCount(query) * 1.8));
    try (TokenStream stream = qAnalyzer.tokenStream(null, query)) {
        stream.reset();
        while (stream.incrementToken()) {
            final BytesRef term = new BytesRef(stream.getAttribute(CharTermAttribute.class));
            if (result.containsKey(term)) {
                result.put(BytesRef.deepCopyOf(term), result.get(term) + 1);
            } else {
                result.put(BytesRef.deepCopyOf(term), 1);
            }
        }
    } catch (final IOException e) {
        // not thrown b/c we're using a string reader
    }
    if (dataProv != null) {
        removeUnknownTerms(dataProv, result.keySet()).stream().forEach(result::remove);
    }
    return result;
}

From source file:de.unihildesheim.iw.lucene.scoring.clarity.DefaultClarityScoreTest.java

License:Open Source License

/**
 * Builds a {@code ModelLowPrecision} for a two-term query with three feedback
 * documents and checks the sizes of the state it exposes.
 */
@SuppressWarnings("ImplicitNumericConversion")
@Test
public void testModelLowPrecision_constructor() throws Exception {
    try (TestMemIndex memIndex = new TestMemIndex()) {
        final IndexDataProvider provider = memIndex.getIdp();

        // query terms
        final BytesRefArray queryTerms = new BytesRefArray(Counter.newCounter(false));
        queryTerms.append(new BytesRef("document1"));
        queryTerms.append(new BytesRef("value"));

        // feedback documents: ids 0, 1 and 2
        @SuppressWarnings("UnnecessarilyQualifiedInnerClassAccess")
        final DocIdSet feedback = new RoaringDocIdSet.Builder(3)
                .add(0)
                .add(1)
                .add(2)
                .build();

        final ModelLowPrecision model = new ModelLowPrecision(provider, LANG_MODEL_WEIGHT, queryTerms,
                feedback);

        Assert.assertEquals("Number of cached document models differs.", memIndex.docs,
                model.docModels.size());
        Assert.assertEquals("Wrong number of feedback documents.", 3L, model.feedbackDocs.length);
    }
}

From source file:de.unihildesheim.iw.lucene.scoring.clarity.DefaultClarityScoreTest.java

License:Open Source License

/**
 * Checks the document model value that {@code ModelLowPrecision} computes for
 * the term "document1" and the document model with id 0.
 */
@SuppressWarnings("ImplicitNumericConversion")
@Test
public void testModelLowPrecision_docModel() throws Exception {
    try (TestMemIndex memIndex = new TestMemIndex()) {
        final IndexDataProvider provider = memIndex.getIdp();

        // query terms
        final BytesRefArray queryTerms = new BytesRefArray(Counter.newCounter(false));
        queryTerms.append(new BytesRef("document1"));
        queryTerms.append(new BytesRef("value"));

        // feedback documents: ids 0, 1 and 2
        @SuppressWarnings("UnnecessarilyQualifiedInnerClassAccess")
        final DocIdSet feedback = new RoaringDocIdSet.Builder(3)
                .add(0)
                .add(1)
                .add(2)
                .build();

        final ModelLowPrecision model = new ModelLowPrecision(provider, LANG_MODEL_WEIGHT, queryTerms,
                feedback);

        final BytesRef probe = new BytesRef("document1");
        // expected = (0.6 * (3/18)) + (1 - 0.6) * relTf(term)
        final double inDocumentPart = 0.6 * (3d / 18d);
        final double collectionPart = (1d - 0.6) * provider.getRelativeTermFrequency(probe);
        final double expected = inDocumentPart + collectionPart;

        final double actual = model.document(provider.getDocumentModel(0), probe);
        // exact comparison (delta of zero)
        Assert.assertEquals("Document model result differs.", expected, actual, 0d);
    }
}

From source file:de.unihildesheim.iw.lucene.scoring.clarity.DefaultClarityScoreTest.java

License:Open Source License

/**
 * Checks the query model value that {@code ModelLowPrecision} computes for
 * the term "document1" against a hand-calculated expectation summed over the
 * three feedback documents.
 */
@SuppressWarnings("ImplicitNumericConversion")
@Test
public void testModelLowPrecision_queryModel() throws Exception {
    try (TestMemIndex idx = new TestMemIndex()) {
        final IndexDataProvider idp = idx.getIdp();
        // query
        final BytesRef qt1 = new BytesRef("document1");
        final BytesRef qt2 = new BytesRef("value");
        final BytesRefArray qt = new BytesRefArray(Counter.newCounter(false));
        qt.append(qt1);
        qt.append(qt2);
        // feedback documents
        @SuppressWarnings("UnnecessarilyQualifiedInnerClassAccess")
        final DocIdSet fb = new RoaringDocIdSet.Builder(3).add(0).add(1).add(2).build();
        final ModelLowPrecision lpMod = new ModelLowPrecision(idp, LANG_MODEL_WEIGHT, qt, fb);

        final BytesRef term = new BytesRef("document1");
        final double relTfTerm = idp.getRelativeTermFrequency(term);
        final double relTfQt1 = idp.getRelativeTermFrequency(qt1);
        final double relTfQt2 = idp.getRelativeTermFrequency(qt2);

        // building blocks of the expectation: 0.6 * (3/18) and (1 - 0.6)
        final double inDocPart = 0.6 * (3d / 18d);
        final double collWeight = 1d - 0.6;

        // doc-0: docModel query term * docModel for all query terms
        final double doc0 = (inDocPart + collWeight * relTfTerm)
                * (inDocPart + collWeight * relTfQt1)
                * (inDocPart + collWeight * relTfQt2);
        // doc-1: docModel query term * docModel for all query terms
        final double doc1 = collWeight * relTfTerm
                * (collWeight * relTfQt1)
                * (inDocPart + collWeight * relTfQt2);
        // doc-2: docModel query term * docModel for all query terms
        final double doc2 = collWeight * relTfTerm
                * (collWeight * relTfQt1)
                * (inDocPart + collWeight * relTfQt2);
        final double expected = doc0 + doc1 + doc2;

        final double result = lpMod.query(term);
        // Tight tolerance: a delta of 0.1e10 (= 1e9) would accept any value
        // and make this assertion meaningless.
        Assert.assertEquals("Query model result differs.", expected, result, 0.1e-10);
    }
}

From source file:de.unihildesheim.iw.lucene.scoring.clarity.DefaultClarityScoreTest.java

License:Open Source License

/**
 * Builds a {@code ModelHighPrecision} for a two-term query with three
 * feedback documents and checks the sizes of the state it exposes.
 */
@SuppressWarnings("ImplicitNumericConversion")
@Test
public void testModelHighPrecision_constructor() throws Exception {
    try (TestMemIndex memIndex = new TestMemIndex()) {
        final IndexDataProvider provider = memIndex.getIdp();

        // query terms
        final BytesRefArray queryTerms = new BytesRefArray(Counter.newCounter(false));
        queryTerms.append(new BytesRef("document1"));
        queryTerms.append(new BytesRef("value"));

        // feedback documents: ids 0, 1 and 2
        @SuppressWarnings("UnnecessarilyQualifiedInnerClassAccess")
        final DocIdSet feedback = new RoaringDocIdSet.Builder(3)
                .add(0)
                .add(1)
                .add(2)
                .build();

        final ModelHighPrecision model = new ModelHighPrecision(provider,
                BigDecimal.valueOf(LANG_MODEL_WEIGHT), queryTerms, feedback);

        Assert.assertEquals("Number of cached document models differs.", memIndex.docs,
                model.docModels.size());
        Assert.assertEquals("Wrong number of feedback documents.", 3L, model.feedbackDocs.length);
    }
}

From source file:de.unihildesheim.iw.lucene.scoring.clarity.DefaultClarityScoreTest.java

License:Open Source License

/**
 * Checks the document model value that {@code ModelHighPrecision} computes
 * for the term "document1" and the document model with id 0.
 */
@SuppressWarnings("ImplicitNumericConversion")
@Test
public void testModelHighPrecision_docModel() throws Exception {
    try (TestMemIndex idx = new TestMemIndex()) {
        final IndexDataProvider idp = idx.getIdp();
        // query
        final BytesRefArray qt = new BytesRefArray(Counter.newCounter(false));
        qt.append(new BytesRef("document1"));
        qt.append(new BytesRef("value"));
        // feedback documents
        @SuppressWarnings("UnnecessarilyQualifiedInnerClassAccess")
        final DocIdSet fb = new RoaringDocIdSet.Builder(3).add(0).add(1).add(2).build();
        final ModelHighPrecision hpMod = new ModelHighPrecision(idp, BigDecimal.valueOf(LANG_MODEL_WEIGHT), qt,
                fb);

        final BytesRef term = new BytesRef("document1");
        // (0.6 * (3/18)) + (1 - 0.6) * relTf(term)
        final double expected = (0.6 * (3d / 18d)) + (1d - 0.6) * idp.getRelativeTermFrequency(term);
        final double result = hpMod.document(idp.getDocumentModel(0), term).doubleValue();
        // Tight tolerance: a delta of 0.1e10 (= 1e9) would accept any value
        // and make this assertion meaningless.
        Assert.assertEquals("Document model result differs.", expected, result, 0.1e-10);
    }
}

From source file:de.unihildesheim.iw.lucene.scoring.clarity.DefaultClarityScoreTest.java

License:Open Source License

/**
 * Checks the query model value that {@code ModelHighPrecision} computes for
 * the term "document1" against a hand-calculated expectation summed over the
 * three feedback documents.
 */
@SuppressWarnings("ImplicitNumericConversion")
@Test
public void testModelHighPrecision_queryModel() throws Exception {
    try (TestMemIndex idx = new TestMemIndex()) {
        final IndexDataProvider idp = idx.getIdp();
        // query
        final BytesRef qt1 = new BytesRef("document1");
        final BytesRef qt2 = new BytesRef("value");
        final BytesRefArray qt = new BytesRefArray(Counter.newCounter(false));
        qt.append(qt1);
        qt.append(qt2);
        // feedback documents
        @SuppressWarnings("UnnecessarilyQualifiedInnerClassAccess")
        final DocIdSet fb = new RoaringDocIdSet.Builder(3).add(0).add(1).add(2).build();
        final ModelHighPrecision hpMod = new ModelHighPrecision(idp, BigDecimal.valueOf(LANG_MODEL_WEIGHT), qt,
                fb);

        final BytesRef term = new BytesRef("document1");
        final double relTfTerm = idp.getRelativeTermFrequency(term);
        final double relTfQt1 = idp.getRelativeTermFrequency(qt1);
        final double relTfQt2 = idp.getRelativeTermFrequency(qt2);

        // building blocks of the expectation: 0.6 * (3/18) and (1 - 0.6)
        final double inDocPart = 0.6 * (3d / 18d);
        final double collWeight = 1d - 0.6;

        // doc-0: docModel query term * docModel for all query terms
        final double doc0 = (inDocPart + collWeight * relTfTerm)
                * (inDocPart + collWeight * relTfQt1)
                * (inDocPart + collWeight * relTfQt2);
        // doc-1: docModel query term * docModel for all query terms
        final double doc1 = collWeight * relTfTerm
                * (collWeight * relTfQt1)
                * (inDocPart + collWeight * relTfQt2);
        // doc-2: docModel query term * docModel for all query terms
        final double doc2 = collWeight * relTfTerm
                * (collWeight * relTfQt1)
                * (inDocPart + collWeight * relTfQt2);
        final double expected = doc0 + doc1 + doc2;

        final double result = hpMod.query(term).doubleValue();
        // Tight tolerance: a delta of 0.1e10 (= 1e9) would accept any value
        // and make this assertion meaningless.
        Assert.assertEquals("Query model result differs.", expected, result, 0.1e-10);
    }
}

From source file:de.unihildesheim.iw.lucene.scoring.clarity.SimplifiedClarityScoreTest.java

License:Open Source License

/**
 * Checks both values of the tuple returned by
 * {@code SimplifiedClarityScore.calcScorePortion} for the term "document1":
 * the query model against inQueryFreq / queryLength and the collection model
 * against the relative term frequency reported by the index data provider.
 */
@SuppressWarnings("UnnecessarilyQualifiedInnerClassAccess")
@Test
public void testCalcScorePortion() throws Exception {
    try (TestMemIndex idx = new TestMemIndex()) {
        final SimplifiedClarityScore dcs = new SimplifiedClarityScore.Builder()
                .analyzer(new WhitespaceAnalyzer()).indexDataProvider(idx.getIdp()).build();

        final BytesRef term = new BytesRef("document1");
        final long inQueryFreq = 3L;
        final long queryLength = 8L;
        final ClarityScoreCalculation.ScoreTupleHighPrecision result = dcs.calcScorePortion(term, inQueryFreq,
                queryLength);

        final double expectedQMod = (double) inQueryFreq / (double) queryLength;
        final double expectedCMod = idx.getIdp().getRelativeTermFrequency(term);

        // Tight tolerance: a delta of 0.1e10 (= 1e9) would accept any value
        // and make these assertions meaningless.
        final double delta = 0.1e-10;
        Assert.assertEquals("Query model value differs.", expectedQMod, result.qModel.doubleValue(), delta);
        Assert.assertEquals("Collection model value differs.", expectedCMod, result.cModel.doubleValue(),
                delta);
    }
}

From source file:de.unihildesheim.iw.lucene.util.BytesRefUtilsTest.java

License:Open Source License

/**
 * Wraps the output of {@code BytesRefUtils.copyBytes} in a new
 * {@code BytesRef} and checks that it is byte-wise equal to the source.
 */
@Test
public void testCopyBytes() throws Exception {
    final BytesRef source = new BytesRef("foo");
    final BytesRef copy = new BytesRef(BytesRefUtils.copyBytes(source));
    final boolean sameBytes = source.bytesEquals(copy);
    Assert.assertTrue("Bytes mismatch.", sameBytes);
}