List of usage examples for the org.apache.lucene.util.BytesRef constructor BytesRef(CharSequence)
public BytesRef(CharSequence text)
From source file:de.unihildesheim.iw.lucene.query.QueryUtils.java
License:Open Source License
/** * Tokenizes a query string using Lucenes analyzer. This also removes * stopwords from the query string. The {@link IndexDataProvider} instance is * used to skip terms no found in the collection. * * @param query Query string to tokenize * @param qAnalyzer Analyzer to use//from w w w. j a v a2s .co m * @param dataProv IndexDataProvider * @return List of tokens from original query with stop-words removed */ @SuppressWarnings("ObjectAllocationInLoop") public static BytesRefArray tokenizeQuery(@NotNull final String query, @NotNull final Analyzer qAnalyzer, @Nullable final IndexDataProvider dataProv) { BytesRefArray result = new BytesRefArray(Counter.newCounter(false)); try (TokenStream stream = qAnalyzer.tokenStream(null, query)) { stream.reset(); while (stream.incrementToken()) { final BytesRef term = new BytesRef(stream.getAttribute(CharTermAttribute.class)); if (term.length > 0) { result.append(term); } } } catch (final IOException e) { // not thrown b/c we're using a string reader } if (dataProv != null) { result = removeUnknownTerms(dataProv, result); } return result; }
From source file:de.unihildesheim.iw.lucene.query.QueryUtils.java
License:Open Source License
/** * Tokenizes a query string using Lucenes analyzer. This also removes * stopwords from the query string. Returns a mapping of query-term to * in-query-frequency. The {@link IndexDataProvider} instance is used to skip * terms no found in the collection./*from ww w. j av a 2s . co m*/ * * @param query Query String * @param qAnalyzer Analyzer used to parse the query String * @param dataProv IndexDataProvider * @return mapping of query-term to in-query-frequency with optionally terms * not in the collection skipped */ @SuppressWarnings("ObjectAllocationInLoop") public static Map<BytesRef, Integer> tokenizeAndMapQuery(@NotNull final String query, @NotNull final Analyzer qAnalyzer, @Nullable final IndexDataProvider dataProv) { // estimate size final Map<BytesRef, Integer> result = new HashMap<>( (int) ((double) StringUtils.estimatedWordCount(query) * 1.8)); try (TokenStream stream = qAnalyzer.tokenStream(null, query)) { stream.reset(); while (stream.incrementToken()) { final BytesRef term = new BytesRef(stream.getAttribute(CharTermAttribute.class)); if (result.containsKey(term)) { result.put(BytesRef.deepCopyOf(term), result.get(term) + 1); } else { result.put(BytesRef.deepCopyOf(term), 1); } } } catch (final IOException e) { // not thrown b/c we're using a string reader } if (dataProv != null) { removeUnknownTerms(dataProv, result.keySet()).stream().forEach(result::remove); } return result; }
From source file:de.unihildesheim.iw.lucene.scoring.clarity.DefaultClarityScoreTest.java
License:Open Source License
@SuppressWarnings("ImplicitNumericConversion") @Test//ww w .ja va 2 s . com public void testModelLowPrecision_constructor() throws Exception { try (TestMemIndex idx = new TestMemIndex()) { final IndexDataProvider idp = idx.getIdp(); // query final BytesRefArray qt = new BytesRefArray(Counter.newCounter(false)); qt.append(new BytesRef("document1")); qt.append(new BytesRef("value")); // feedback documents @SuppressWarnings("UnnecessarilyQualifiedInnerClassAccess") final DocIdSet fb = new RoaringDocIdSet.Builder(3).add(0).add(1).add(2).build(); final ModelLowPrecision lpMod = new ModelLowPrecision(idp, LANG_MODEL_WEIGHT, qt, fb); Assert.assertEquals("Number of cached document models differs.", idx.docs, lpMod.docModels.size()); Assert.assertEquals("Wrong number of feedback documents.", 3L, lpMod.feedbackDocs.length); } }
From source file:de.unihildesheim.iw.lucene.scoring.clarity.DefaultClarityScoreTest.java
License:Open Source License
@SuppressWarnings("ImplicitNumericConversion") @Test//from w w w. j ava2 s . co m public void testModelLowPrecision_docModel() throws Exception { try (TestMemIndex idx = new TestMemIndex()) { final IndexDataProvider idp = idx.getIdp(); // query final BytesRefArray qt = new BytesRefArray(Counter.newCounter(false)); qt.append(new BytesRef("document1")); qt.append(new BytesRef("value")); // feedback documents @SuppressWarnings("UnnecessarilyQualifiedInnerClassAccess") final DocIdSet fb = new RoaringDocIdSet.Builder(3).add(0).add(1).add(2).build(); final ModelLowPrecision lpMod = new ModelLowPrecision(idp, LANG_MODEL_WEIGHT, qt, fb); final BytesRef term = new BytesRef("document1"); // (0.6 * (3/18)) + (1 - 0.6) * relTf(term) final double expected = (0.6 * (3d / 18d)) + (1d - 0.6) * idp.getRelativeTermFrequency(term); final double result = lpMod.document(idp.getDocumentModel(0), term); Assert.assertEquals("Document model result differs.", expected, result, 0d); } }
From source file:de.unihildesheim.iw.lucene.scoring.clarity.DefaultClarityScoreTest.java
License:Open Source License
@SuppressWarnings("ImplicitNumericConversion") @Test/*from w w w.ja v a 2 s. c om*/ public void testModelLowPrecision_queryModel() throws Exception { try (TestMemIndex idx = new TestMemIndex()) { final IndexDataProvider idp = idx.getIdp(); // query final BytesRef qt1 = new BytesRef("document1"); final BytesRef qt2 = new BytesRef("value"); final BytesRefArray qt = new BytesRefArray(Counter.newCounter(false)); qt.append(qt1); qt.append(qt2); // feedback documents @SuppressWarnings("UnnecessarilyQualifiedInnerClassAccess") final DocIdSet fb = new RoaringDocIdSet.Builder(3).add(0).add(1).add(2).build(); final ModelLowPrecision lpMod = new ModelLowPrecision(idp, LANG_MODEL_WEIGHT, qt, fb); final BytesRef term = new BytesRef("document1"); final double expected = // doc-0: docModel query term (((0.6 * (3d / 18d)) + (1d - 0.6) * idp.getRelativeTermFrequency(term)) * // doc-0: docModel for all query terms ((0.6 * (3d / 18d)) + (1d - 0.6) * idp.getRelativeTermFrequency(qt1)) * ((0.6 * (3d / 18d)) + (1d - 0.6) * idp.getRelativeTermFrequency(qt2))) + // doc-1: docModel query term ((1d - 0.6) * idp.getRelativeTermFrequency(term) * // doc-1: docModel for all query terms ((1d - 0.6) * idp.getRelativeTermFrequency(qt1)) * ((0.6 * (3d / 18d)) + (1d - 0.6) * idp.getRelativeTermFrequency(qt2))) + // doc-2: docModel query term ((1d - 0.6) * idp.getRelativeTermFrequency(term) * // doc-2: docModel for all query terms ((1d - 0.6) * idp.getRelativeTermFrequency(qt1)) * ((0.6 * (3d / 18d)) + (1d - 0.6) * idp.getRelativeTermFrequency(qt2))); final double result = lpMod.query(term); Assert.assertEquals("Query model result differs.", expected, result, 0.1e10); } }
From source file:de.unihildesheim.iw.lucene.scoring.clarity.DefaultClarityScoreTest.java
License:Open Source License
@SuppressWarnings("ImplicitNumericConversion") @Test/*from www.j av a 2s.c om*/ public void testModelHighPrecision_constructor() throws Exception { try (TestMemIndex idx = new TestMemIndex()) { final IndexDataProvider idp = idx.getIdp(); // query final BytesRefArray qt = new BytesRefArray(Counter.newCounter(false)); qt.append(new BytesRef("document1")); qt.append(new BytesRef("value")); // feedback documents @SuppressWarnings("UnnecessarilyQualifiedInnerClassAccess") final DocIdSet fb = new RoaringDocIdSet.Builder(3).add(0).add(1).add(2).build(); final ModelHighPrecision hpMod = new ModelHighPrecision(idp, BigDecimal.valueOf(LANG_MODEL_WEIGHT), qt, fb); Assert.assertEquals("Number of cached document models differs.", idx.docs, hpMod.docModels.size()); Assert.assertEquals("Wrong number of feedback documents.", 3L, hpMod.feedbackDocs.length); } }
From source file:de.unihildesheim.iw.lucene.scoring.clarity.DefaultClarityScoreTest.java
License:Open Source License
@SuppressWarnings("ImplicitNumericConversion") @Test//from ww w.j av a 2 s. com public void testModelHighPrecision_docModel() throws Exception { try (TestMemIndex idx = new TestMemIndex()) { final IndexDataProvider idp = idx.getIdp(); // query final BytesRefArray qt = new BytesRefArray(Counter.newCounter(false)); qt.append(new BytesRef("document1")); qt.append(new BytesRef("value")); // feedback documents @SuppressWarnings("UnnecessarilyQualifiedInnerClassAccess") final DocIdSet fb = new RoaringDocIdSet.Builder(3).add(0).add(1).add(2).build(); final ModelHighPrecision hpMod = new ModelHighPrecision(idp, BigDecimal.valueOf(LANG_MODEL_WEIGHT), qt, fb); final BytesRef term = new BytesRef("document1"); // (0.6 * (3/18)) + (1 - 0.6) * relTf(term) final double expected = (0.6 * (3d / 18d)) + (1d - 0.6) * idp.getRelativeTermFrequency(term); final double result = hpMod.document(idp.getDocumentModel(0), term).doubleValue(); Assert.assertEquals("Document model result differs.", expected, result, 0.1e10); } }
From source file:de.unihildesheim.iw.lucene.scoring.clarity.DefaultClarityScoreTest.java
License:Open Source License
@SuppressWarnings("ImplicitNumericConversion") @Test/* w w w.j a va2 s .c om*/ public void testModelHighPrecision_queryModel() throws Exception { try (TestMemIndex idx = new TestMemIndex()) { final IndexDataProvider idp = idx.getIdp(); // query final BytesRef qt1 = new BytesRef("document1"); final BytesRef qt2 = new BytesRef("value"); final BytesRefArray qt = new BytesRefArray(Counter.newCounter(false)); qt.append(qt1); qt.append(qt2); // feedback documents @SuppressWarnings("UnnecessarilyQualifiedInnerClassAccess") final DocIdSet fb = new RoaringDocIdSet.Builder(3).add(0).add(1).add(2).build(); final ModelHighPrecision hpMod = new ModelHighPrecision(idp, BigDecimal.valueOf(LANG_MODEL_WEIGHT), qt, fb); final BytesRef term = new BytesRef("document1"); final double expected = // doc-0: docModel query term (((0.6 * (3d / 18d)) + (1d - 0.6) * idp.getRelativeTermFrequency(term)) * // doc-0: docModel for all query terms ((0.6 * (3d / 18d)) + (1d - 0.6) * idp.getRelativeTermFrequency(qt1)) * ((0.6 * (3d / 18d)) + (1d - 0.6) * idp.getRelativeTermFrequency(qt2))) + // doc-1: docModel query term ((1d - 0.6) * idp.getRelativeTermFrequency(term) * // doc-1: docModel for all query terms ((1d - 0.6) * idp.getRelativeTermFrequency(qt1)) * ((0.6 * (3d / 18d)) + (1d - 0.6) * idp.getRelativeTermFrequency(qt2))) + // doc-2: docModel query term ((1d - 0.6) * idp.getRelativeTermFrequency(term) * // doc-2: docModel for all query terms ((1d - 0.6) * idp.getRelativeTermFrequency(qt1)) * ((0.6 * (3d / 18d)) + (1d - 0.6) * idp.getRelativeTermFrequency(qt2))); final double result = hpMod.query(term).doubleValue(); Assert.assertEquals("Query model result differs.", expected, result, 0.1e10); } }
From source file:de.unihildesheim.iw.lucene.scoring.clarity.SimplifiedClarityScoreTest.java
License:Open Source License
/**
 * Verifies the query model and collection model values returned by
 * {@code SimplifiedClarityScore.calcScorePortion} for a single term. The
 * expected query model is the in-query frequency divided by the query length;
 * the expected collection model is the relative term frequency reported by
 * the index data provider.
 *
 * @throws Exception Any exception thrown indicates an error
 */
@SuppressWarnings("UnnecessarilyQualifiedInnerClassAccess")
@Test
public void testCalcScorePortion() throws Exception {
  try (TestMemIndex idx = new TestMemIndex()) {
    final SimplifiedClarityScore dcs = new SimplifiedClarityScore.Builder()
        .analyzer(new WhitespaceAnalyzer())
        .indexDataProvider(idx.getIdp())
        .build();

    final BytesRef term = new BytesRef("document1");
    final long inQueryFreq = 3L;
    final long queryLength = 8L;

    final ClarityScoreCalculation.ScoreTupleHighPrecision result =
        dcs.calcScorePortion(term, inQueryFreq, queryLength);

    final double expectedQMod = (double) inQueryFreq / (double) queryLength;
    final double expectedCMod = idx.getIdp().getRelativeTermFrequency(term);

    // The tolerance was 0.1e10 (= 1e9), which accepts virtually any value and
    // made both assertions vacuous; a tight tolerance still absorbs the
    // conversion to double.
    Assert.assertEquals("Query model value differs.",
        expectedQMod, result.qModel.doubleValue(), 0.1e-10);
    Assert.assertEquals("Collection model value differs.",
        expectedCMod, result.cModel.doubleValue(), 0.1e-10);
  }
}
From source file:de.unihildesheim.iw.lucene.util.BytesRefUtilsTest.java
License:Open Source License
/**
 * Copies the bytes of a {@code BytesRef} via {@code BytesRefUtils.copyBytes}
 * and checks that a {@code BytesRef} wrapping the copy has equal bytes.
 *
 * @throws Exception Any exception thrown indicates an error
 */
@Test
public void testCopyBytes() throws Exception {
  final BytesRef source = new BytesRef("foo");

  final BytesRef copy = new BytesRef(BytesRefUtils.copyBytes(source));
  final boolean sameBytes = source.bytesEquals(copy);

  Assert.assertTrue("Bytes mismatch.", sameBytes);
}