List of usage examples for org.apache.lucene.util.BytesRef#utf8ToString()
public String utf8ToString()
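BytesRef.utf8ToString() interprets the bytes this ref points to (length bytes starting at offset) as UTF-8 and decodes them into a new String. A minimal round-trip sketch (a standalone snippet, not from the source files below; names are illustrative):

import org.apache.lucene.util.BytesRef;

public class Utf8ToStringDemo {
  public static void main(String[] args) {
    // A BytesRef built from a CharSequence stores its UTF-8 encoding
    BytesRef ref = new BytesRef("café");

    // Decode the referenced bytes back into a java.lang.String
    System.out.println(ref.utf8ToString()); // prints: café
  }
}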
From source file:com.github.flaxsearch.api.PositionData.java
License:Apache License
static String payloadToString(BytesRef payload) {
  if (payload == null)
    return null;
  return payload.utf8ToString();
}
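A hedged sketch of how a helper like payloadToString might be fed from a positioned PostingsEnum (assumed usage, not from the source file above; termsEnum is illustrative):

// Request payloads along with positions; getPayload() may return null,
// which payloadToString() handles by returning null.
PostingsEnum postings = termsEnum.postings(null, PostingsEnum.PAYLOADS);
if (postings.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
  postings.nextPosition();
  String payloadText = payloadToString(postings.getPayload());
}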
From source file:com.joliciel.jochre.search.highlight.LuceneQueryHighlighter.java
License:Open Source License
public Map<Integer, Set<HighlightTerm>> highlight(Set<Integer> docIds, Set<String> fields) {
  try {
    Map<Integer, Set<HighlightTerm>> termMap = new HashMap<Integer, Set<HighlightTerm>>();
    Map<Integer, Document> idToDocMap = new HashMap<Integer, Document>();
    Map<Integer, CoordinateStorage> idToCoordinateStorageMap = new HashMap<Integer, CoordinateStorage>();
    Map<Integer, Set<Integer>> myLeaves = new HashMap<Integer, Set<Integer>>();
    for (int docId : docIds) {
      Document luceneDoc = indexSearcher.doc(docId);
      idToDocMap.put(docId, luceneDoc);
      JochreIndexDocument jochreDoc = searchService.getJochreIndexDocument(indexSearcher, docId);
      idToCoordinateStorageMap.put(docId, jochreDoc.getCoordinateStorage());
      termMap.put(docId, new TreeSet<HighlightTerm>());
      int leaf = ReaderUtil.subIndex(docId, leaves);
      Set<Integer> docsPerLeaf = myLeaves.get(leaf);
      if (docsPerLeaf == null) {
        docsPerLeaf = new HashSet<Integer>();
        myLeaves.put(leaf, docsPerLeaf);
      }
      docsPerLeaf.add(docId);
    }

    for (int leaf : myLeaves.keySet()) {
      if (LOG.isTraceEnabled())
        LOG.trace("Searching leaf " + leaf);
      Set<Integer> docsPerLeaf = myLeaves.get(leaf);
      AtomicReaderContext subContext = leaves.get(leaf);
      AtomicReader atomicReader = subContext.reader();

      int fieldCounter = 0;
      for (String field : fields) {
        fieldCounter++;
        if (LOG.isTraceEnabled())
          LOG.trace("Field " + fieldCounter + ": " + field);
        Terms atomicReaderTerms = atomicReader.terms(field);
        if (atomicReaderTerms == null) {
          continue; // nothing to do
        }
        TermsEnum termsEnum = atomicReaderTerms.iterator(TermsEnum.EMPTY);

        int termCounter = 0;
        for (BytesRef term : terms) {
          termCounter++;
          if (LOG.isTraceEnabled())
            LOG.trace("Searching for term " + termCounter + ": " + term.utf8ToString() + " in field " + field);

          if (!termsEnum.seekExact(term)) {
            continue; // term not found
          }

          DocsAndPositionsEnum docPosEnum = termsEnum.docsAndPositions(null, null,
              DocsAndPositionsEnum.FLAG_OFFSETS);
          int relativeDocId = docPosEnum.nextDoc();
          while (relativeDocId != DocsAndPositionsEnum.NO_MORE_DOCS) {
            int docId = subContext.docBase + relativeDocId;
            if (docsPerLeaf.contains(docId)) {
              Document doc = idToDocMap.get(docId);
              Set<HighlightTerm> highlightTerms = termMap.get(docId);

              // Retrieve the term frequency in the current document
              int freq = docPosEnum.freq();
              if (LOG.isTraceEnabled()) {
                String extId = doc.get("id");
                String path = doc.get("path");
                LOG.trace("Found " + freq + " matches for doc " + docId + ", extId: " + extId + ", path: " + path);
              }
              for (int i = 0; i < freq; i++) {
                int position = docPosEnum.nextPosition();
                int start = docPosEnum.startOffset();
                int end = docPosEnum.endOffset();

                if (LOG.isTraceEnabled())
                  LOG.trace("Found match " + position + " at docId " + docId + ", field " + field
                      + " start=" + start + ", end=" + end);

                CoordinateStorage coordinateStorage = idToCoordinateStorageMap.get(docId);
                int imageIndex = coordinateStorage.getImageIndex(start);
                int pageIndex = coordinateStorage.getPageIndex(start);
                HighlightTerm highlightTerm = new HighlightTerm(docId, field, start, end, imageIndex, pageIndex);
                highlightTerm.setWeight(this.weigh(term));
                if (highlightTerm.getWeight() > 0)
                  highlightTerms.add(highlightTerm);
              }
            }
            relativeDocId = docPosEnum.nextDoc();
          }
        } // next term
      } // next field
    } // next index leaf to search
    return termMap;
  } catch (IOException e) {
    LogUtils.logError(LOG, e);
    throw new RuntimeException(e);
  }
}
From source file:com.mathworks.xzheng.advsearching.BooksLikeThis.java
License:Apache License
public Document[] docsLike(int id, int max) throws IOException {
  Document doc = reader.document(id);

  String[] authors = doc.getValues("author");
  BooleanQuery authorQuery = new BooleanQuery(); // #3
  for (String author : authors) { // #3
    authorQuery.add(new TermQuery(new Term("author", author)), // #3
        BooleanClause.Occur.SHOULD); // #3
  }
  authorQuery.setBoost(2.0f);

  Terms terms = reader.getTermVector(id, "subject"); // #4
  BooleanQuery subjectQuery = new BooleanQuery(); // #4
  TermsEnum termsEnum = terms.iterator(null);
  BytesRef text;
  while ((text = termsEnum.next()) != null) { // #4
    TermQuery tq = new TermQuery( // #4
        new Term("subject", text.utf8ToString())); // #4
    subjectQuery.add(tq, BooleanClause.Occur.SHOULD); // #4
  }

  BooleanQuery likeThisQuery = new BooleanQuery(); // #5
  likeThisQuery.add(authorQuery, BooleanClause.Occur.SHOULD); // #5
  likeThisQuery.add(subjectQuery, BooleanClause.Occur.SHOULD); // #5

  likeThisQuery.add(new TermQuery( // #6
      new Term("isbn", doc.get("isbn"))), BooleanClause.Occur.MUST_NOT); // #6

  // System.out.println("  Query: " + likeThisQuery.toString("contents"));

  TopDocs hits = searcher.search(likeThisQuery, 10);
  int size = max;
  if (max > hits.scoreDocs.length)
    size = hits.scoreDocs.length;

  Document[] docs = new Document[size];
  for (int i = 0; i < size; i++) {
    docs[i] = reader.document(hits.scoreDocs[i].doc);
  }

  return docs;
}
From source file:com.meizu.nlp.classification.BooleanPerceptronClassifier.java
License:Apache License
/** {@inheritDoc} */
@Override
public void train(LeafReader leafReader, String textFieldName, String classFieldName,
    Analyzer analyzer, Query query) throws IOException {
  this.textTerms = MultiFields.getTerms(leafReader, textFieldName);
  if (textTerms == null) {
    throw new IOException("term vectors need to be available for field " + textFieldName);
  }
  this.analyzer = analyzer;
  this.textFieldName = textFieldName;

  if (threshold == null || threshold == 0d) {
    // automatically assign a threshold
    long sumDocFreq = leafReader.getSumDocFreq(textFieldName);
    if (sumDocFreq != -1) {
      this.threshold = (double) sumDocFreq / 2d;
    } else {
      throw new IOException("threshold cannot be assigned since term vectors for field "
          + textFieldName + " do not exist");
    }
  }

  // TODO : remove this map as soon as we have a writable FST
  SortedMap<String, Double> weights = new TreeMap<>();

  TermsEnum termsEnum = textTerms.iterator();
  BytesRef textTerm;
  while ((textTerm = termsEnum.next()) != null) {
    weights.put(textTerm.utf8ToString(), (double) termsEnum.totalTermFreq());
  }
  updateFST(weights);

  IndexSearcher indexSearcher = new IndexSearcher(leafReader);

  int batchCount = 0;

  BooleanQuery q = new BooleanQuery();
  q.add(new BooleanClause(new WildcardQuery(new Term(classFieldName, "*")), BooleanClause.Occur.MUST));
  if (query != null) {
    q.add(new BooleanClause(query, BooleanClause.Occur.MUST));
  }
  // run the search and use stored field values
  for (ScoreDoc scoreDoc : indexSearcher.search(q, Integer.MAX_VALUE).scoreDocs) {
    Document doc = indexSearcher.doc(scoreDoc.doc);

    IndexableField textField = doc.getField(textFieldName);

    // get the expected result
    IndexableField classField = doc.getField(classFieldName);

    if (textField != null && classField != null) {
      // assign class to the doc
      ClassificationResult<Boolean> classificationResult = assignClass(textField.stringValue());
      Boolean assignedClass = classificationResult.getAssignedClass();

      Boolean correctClass = Boolean.valueOf(classField.stringValue());
      long modifier = correctClass.compareTo(assignedClass);
      if (modifier != 0) {
        updateWeights(leafReader, scoreDoc.doc, assignedClass, weights, modifier,
            batchCount % batchSize == 0);
      }
      batchCount++;
    }
  }
  weights.clear(); // free memory while waiting for GC
}
From source file:com.meizu.nlp.classification.BooleanPerceptronClassifier.java
License:Apache License
private void updateWeights(LeafReader leafReader, int docId, Boolean assignedClass,
    SortedMap<String, Double> weights, double modifier, boolean updateFST) throws IOException {
  TermsEnum cte = textTerms.iterator();

  // get the doc term vectors
  Terms terms = leafReader.getTermVector(docId, textFieldName);

  if (terms == null) {
    throw new IOException("term vectors must be stored for field " + textFieldName);
  }

  TermsEnum termsEnum = terms.iterator();

  BytesRef term;
  while ((term = termsEnum.next()) != null) {
    cte.seekExact(term);
    if (assignedClass != null) {
      long termFreqLocal = termsEnum.totalTermFreq();
      // update weights
      Long previousValue = Util.get(fst, term);
      String termString = term.utf8ToString();
      weights.put(termString, previousValue + modifier * termFreqLocal);
    }
  }
  if (updateFST) {
    updateFST(weights);
  }
}
From source file:com.meizu.nlp.classification.CachingNaiveBayesClassifier.java
License:Apache License
/**
 * This function builds the frame of the cache. The cache stores word
 * occurrences in memory after they have been searched once. This cache can
 * yield a 2-100x speedup when used properly, but can consume a lot of memory.
 * There is an option to lower memory consumption: if a word has a very low
 * occurrence count in the index, you can filter it out. The other parameter
 * switches the term search mode: if true, only the terms in the cache
 * skeleton are searched; if false, terms not in the cache are searched as
 * well (but not cached).
 *
 * @param minTermOccurrenceInCache Lower the cache size with a higher value.
 * @param justCachedTerms The switch to fully exclude low-occurrence terms.
 * @throws IOException If there is a low-level I/O error.
 */
public void reInitCache(int minTermOccurrenceInCache, boolean justCachedTerms) throws IOException {
  this.justCachedTerms = justCachedTerms;

  this.docsWithClassSize = countDocsWithClass();
  termCClassHitCache.clear();
  cclasses.clear();
  classTermFreq.clear();

  // build the cache for the words
  Map<String, Long> frequencyMap = new HashMap<>();
  for (String textFieldName : textFieldNames) {
    TermsEnum termsEnum = leafReader.terms(textFieldName).iterator();
    while (termsEnum.next() != null) {
      BytesRef term = termsEnum.term();
      String termText = term.utf8ToString();
      long frequency = termsEnum.docFreq();
      Long lastfreq = frequencyMap.get(termText);
      if (lastfreq != null)
        frequency += lastfreq;
      frequencyMap.put(termText, frequency);
    }
  }
  for (Map.Entry<String, Long> entry : frequencyMap.entrySet()) {
    if (entry.getValue() > minTermOccurrenceInCache) {
      termCClassHitCache.put(entry.getKey(), new ConcurrentHashMap<BytesRef, Integer>());
    }
  }

  // fill the class list
  Terms terms = MultiFields.getTerms(leafReader, classFieldName);
  TermsEnum termsEnum = terms.iterator();
  while (termsEnum.next() != null) {
    cclasses.add(BytesRef.deepCopyOf(termsEnum.term()));
  }
  // fill the classTermFreq map
  for (BytesRef cclass : cclasses) {
    double avgNumberOfUniqueTerms = 0;
    for (String textFieldName : textFieldNames) {
      terms = MultiFields.getTerms(leafReader, textFieldName);
      long numPostings = terms.getSumDocFreq(); // number of term/doc pairs
      avgNumberOfUniqueTerms += numPostings / (double) terms.getDocCount();
    }
    int docsWithC = leafReader.docFreq(new Term(classFieldName, cclass));
    classTermFreq.put(cclass, avgNumberOfUniqueTerms * docsWithC);
  }
}
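A hedged usage sketch for the cache described above (the classifier instance and the threshold value 5 are illustrative assumptions, not taken from the source file):

// Warm the cache, keeping only terms that occur in more than 5 documents,
// and restrict later lookups to cached terms only.
classifier.reInitCache(5, true);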
From source file:com.meltwater.elasticsearch.shard.BatchQueriesLoaderCollector.java
License:Apache License
@Override
public void collect(int doc) throws IOException {
  idValues.setDocument(doc);
  if (idValues.count() > 0) {
    assert idValues.count() == 1;
    BytesRef id = idValues.valueAt(0);
    fieldsVisitor.reset();
    reader.document(doc, fieldsVisitor);
    try {
      // id is only used for logging; if we fail we log the id in the catch statement
      final QueryAndSource queryAndSource = percolator.parsePercolatorDocument(null, fieldsVisitor.source());
      queries.put(id.utf8ToString(), queryAndSource);
    } catch (Exception e) {
      logger.warn("failed to add query [{}]", e, id.utf8ToString());
    }
  } else {
    logger.error("failed to load query since field [{}] not present", ID_FIELD);
  }
}
From source file:com.rocana.lucene.codec.v1.RocanaBasePostingsFormatTestCase.java
License:Apache License
@Override
public void testInvertedWrite() throws Exception {
  Directory dir = newDirectory();
  MockAnalyzer analyzer = new MockAnalyzer(random());
  analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
  IndexWriterConfig iwc = newIndexWriterConfig(analyzer);

  // Must be concurrent because thread(s) can be merging
  // while up to one thread flushes, and each of those
  // threads iterates over the map while the flushing
  // thread might be adding to it:
  final Map<String, TermFreqs> termFreqs = new ConcurrentHashMap<>();

  final AtomicLong sumDocFreq = new AtomicLong();
  final AtomicLong sumTotalTermFreq = new AtomicLong();

  // TODO: would be better to use / delegate to the current
  // Codec returned by getCodec()

  iwc.setCodec(new AssertingCodec() {
    @Override
    public PostingsFormat getPostingsFormatForField(String field) {

      PostingsFormat p = getCodec().postingsFormat();
      if (p instanceof PerFieldPostingsFormat) {
        p = ((PerFieldPostingsFormat) p).getPostingsFormatForField(field);
      }
      if (p instanceof RocanaPerFieldPostingsFormat) {
        p = ((RocanaPerFieldPostingsFormat) p).getPostingsFormatForField(field);
      }
      final PostingsFormat defaultPostingsFormat = p;

      final Thread mainThread = Thread.currentThread();

      if (field.equals("body")) {

        // A PF that counts up some stats and then in
        // the end we verify the stats match what the
        // final IndexReader says, just to exercise the
        // new freedom of iterating the postings more
        // than once at flush/merge:

        return new PostingsFormat(defaultPostingsFormat.getName()) {

          @Override
          public FieldsConsumer fieldsConsumer(final SegmentWriteState state) throws IOException {

            final FieldsConsumer fieldsConsumer = defaultPostingsFormat.fieldsConsumer(state);

            return new FieldsConsumer() {
              @Override
              public void write(Fields fields) throws IOException {
                fieldsConsumer.write(fields);

                boolean isMerge = state.context.context == IOContext.Context.MERGE;

                // We only use one thread for flushing
                // in this test:
                assert isMerge || Thread.currentThread() == mainThread;

                // We iterate the provided TermsEnum
                // twice, so we exercise this new freedom
                // with the inverted API; if
                // addOnSecondPass is true, we add up
                // term stats on the 2nd iteration:
                boolean addOnSecondPass = random().nextBoolean();

                //System.out.println("write isMerge=" + isMerge + " 2ndPass=" + addOnSecondPass);

                // Gather our own stats:
                Terms terms = fields.terms("body");
                assert terms != null;

                TermsEnum termsEnum = terms.iterator();
                PostingsEnum docs = null;
                while (termsEnum.next() != null) {
                  BytesRef term = termsEnum.term();
                  // TODO: also sometimes ask for payloads/offsets?
                  boolean noPositions = random().nextBoolean();
                  if (noPositions) {
                    docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                  } else {
                    docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                  }
                  int docFreq = 0;
                  long totalTermFreq = 0;
                  while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                    docFreq++;
                    totalTermFreq += docs.freq();
                    int limit = TestUtil.nextInt(random(), 1, docs.freq());
                    if (!noPositions) {
                      for (int i = 0; i < limit; i++) {
                        docs.nextPosition();
                      }
                    }
                  }
                  String termString = term.utf8ToString();

                  // During merge we should only see terms
                  // we had already seen during a
                  // previous flush:
                  assertTrue(isMerge == false || termFreqs.containsKey(termString));
                  if (isMerge == false) {
                    if (addOnSecondPass == false) {
                      TermFreqs tf = termFreqs.get(termString);
                      if (tf == null) {
                        tf = new TermFreqs();
                        termFreqs.put(termString, tf);
                      }
                      tf.docFreq += docFreq;
                      tf.totalTermFreq += totalTermFreq;
                      sumDocFreq.addAndGet(docFreq);
                      sumTotalTermFreq.addAndGet(totalTermFreq);
                    } else if (termFreqs.containsKey(termString) == false) {
                      // Add placeholder (2nd pass will
                      // set its counts):
                      termFreqs.put(termString, new TermFreqs());
                    }
                  }
                }

                // Also test seeking the TermsEnum:
                for (String term : termFreqs.keySet()) {
                  if (termsEnum.seekExact(new BytesRef(term))) {
                    // TODO: also sometimes ask for payloads/offsets?
                    boolean noPositions = random().nextBoolean();
                    if (noPositions) {
                      docs = termsEnum.postings(docs, PostingsEnum.FREQS);
                    } else {
                      docs = termsEnum.postings(null, PostingsEnum.POSITIONS);
                    }

                    int docFreq = 0;
                    long totalTermFreq = 0;
                    while (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
                      docFreq++;
                      totalTermFreq += docs.freq();
                      int limit = TestUtil.nextInt(random(), 1, docs.freq());
                      if (!noPositions) {
                        for (int i = 0; i < limit; i++) {
                          docs.nextPosition();
                        }
                      }
                    }

                    if (isMerge == false && addOnSecondPass) {
                      TermFreqs tf = termFreqs.get(term);
                      assert tf != null;
                      tf.docFreq += docFreq;
                      tf.totalTermFreq += totalTermFreq;
                      sumDocFreq.addAndGet(docFreq);
                      sumTotalTermFreq.addAndGet(totalTermFreq);
                    }

                    //System.out.println("  term=" + term + " docFreq=" + docFreq + " ttDF=" + termToDocFreq.get(term));
                    assertTrue(docFreq <= termFreqs.get(term).docFreq);
                    assertTrue(totalTermFreq <= termFreqs.get(term).totalTermFreq);
                  }
                }

                // Also test seekCeil
                for (int iter = 0; iter < 10; iter++) {
                  BytesRef term = new BytesRef(TestUtil.randomRealisticUnicodeString(random()));
                  SeekStatus status = termsEnum.seekCeil(term);
                  if (status == SeekStatus.NOT_FOUND) {
                    assertTrue(term.compareTo(termsEnum.term()) < 0);
                  }
                }
              }

              @Override
              public void close() throws IOException {
                fieldsConsumer.close();
              }
            };
          }

          @Override
          public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
            return defaultPostingsFormat.fieldsProducer(state);
          }
        };
      } else {
        return defaultPostingsFormat;
      }
    }
  });

  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc);
  LineFileDocs docs = new LineFileDocs(random());
  int bytesToIndex = atLeast(100) * 1024;
  int bytesIndexed = 0;
  while (bytesIndexed < bytesToIndex) {
    Document doc = docs.nextDoc();
    w.addDocument(doc);
    bytesIndexed += RamUsageTester.sizeOf(doc);
  }

  IndexReader r = w.getReader();
  w.close();

  Terms terms = MultiFields.getTerms(r, "body");
  assertEquals(sumDocFreq.get(), terms.getSumDocFreq());
  assertEquals(sumTotalTermFreq.get(), terms.getSumTotalTermFreq());

  TermsEnum termsEnum = terms.iterator();
  long termCount = 0;
  boolean supportsOrds = true;
  while (termsEnum.next() != null) {
    BytesRef term = termsEnum.term();
    assertEquals(termFreqs.get(term.utf8ToString()).docFreq, termsEnum.docFreq());
    assertEquals(termFreqs.get(term.utf8ToString()).totalTermFreq, termsEnum.totalTermFreq());
    if (supportsOrds) {
      long ord;
      try {
        ord = termsEnum.ord();
      } catch (UnsupportedOperationException uoe) {
        supportsOrds = false;
        ord = -1;
      }
      if (ord != -1) {
        assertEquals(termCount, ord);
      }
    }
    termCount++;
  }
  assertEquals(termFreqs.size(), termCount);

  r.close();
  dir.close();
}
From source file:com.rocana.lucene.codec.v1.RocanaBlockTreeTermsReader.java
License:Apache License
String brToString(BytesRef b) {
  if (b == null) {
    return "null";
  } else {
    try {
      return b.utf8ToString() + " " + b;
    } catch (Throwable t) {
      // If BytesRef isn't actually UTF8, or it's e.g. a
      // prefix of UTF8 that ends mid-unicode-char, we
      // fall back to hex:
      return b.toString();
    }
  }
}
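For reference, a small assumed sketch of the fallback behavior (outputs are illustrative): bytes that are not complete UTF-8 make utf8ToString() fail, and brToString() then returns only the bracketed hex form from BytesRef.toString():

BytesRef valid = new BytesRef("abc");
System.out.println(brToString(valid));     // e.g. "abc [61 62 63]"

// 0xC3 alone is a truncated two-byte UTF-8 sequence, so
// utf8ToString() throws and brToString() falls back to hex:
BytesRef truncated = new BytesRef(new byte[] { (byte) 0xC3 });
System.out.println(brToString(truncated)); // e.g. "[c3]"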
From source file:com.rocana.lucene.codec.v1.RocanaIntersectTermsEnum.java
License:Apache License
@SuppressWarnings("unused") static String brToString(BytesRef b) { try {//from w w w . j a va 2 s . c o m return b.utf8ToString() + " " + b; } catch (Throwable t) { // If BytesRef isn't actually UTF8, or it's eg a // prefix of UTF8 that ends mid-unicode-char, we // fallback to hex: return b.toString(); } }