Usage examples for org.apache.lucene.util.BytesRef#utf8ToString

public String utf8ToString()

Interprets this BytesRef's bytes as UTF-8 and decodes them into a new String.
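A minimal standalone round trip (not taken from the source files below): the BytesRef(CharSequence) constructor encodes a String as UTF-8, and utf8ToString() decodes it back.

import org.apache.lucene.util.BytesRef;

BytesRef ref = new BytesRef("schöne Grüße"); // constructor encodes the String as UTF-8
String decoded = ref.utf8ToString();         // decodes bytes[offset..offset+length) back to a String
assert decoded.equals("schöne Grüße");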
From source file:org.meresco.lucene.suggestion.SuggestionNGramIndex.java
License:Open Source License
public void createSuggestions(IndexReader reader, String suggestionFieldname, String keyFieldname,
        IndexingState indexingState) throws IOException {
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    List<AtomicReaderContext> leaves = reader.leaves();
    Terms terms = MultiFields.getTerms(reader, suggestionFieldname);
    if (terms == null)
        return;
    TermsEnum termsEnum = terms.iterator(null);
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
        List<Long> keys = new ArrayList<>();
        DocsEnum docsEnum = termsEnum.docs(liveDocs, null, DocsEnum.FLAG_NONE);
        while (true) {
            int docId = docsEnum.nextDoc();
            if (docId == DocsEnum.NO_MORE_DOCS) {
                break;
            }
            keys.add(keyForDoc(docId, leaves, keyFieldname));
        }
        if (keys.size() > 0) {
            String[] values = term.utf8ToString().split(SuggestionIndex.CONCAT_MARKER.replace("$", "\\$"));
            indexNGram(values[0], values[1], values[2], keys);
            indexingState.count++;
        }
    }
    this.commit();
}
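Note that this example targets the Lucene 4.x API (terms.iterator(null), DocsEnum, AtomicReaderContext). On Lucene 6.x–7.x the same walk would use PostingsEnum instead; a minimal sketch under that assumption (walkTerms and its arguments are illustrative names, not part of the source above):

import java.io.IOException;
import org.apache.lucene.index.*;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;

static void walkTerms(IndexReader reader, String field) throws IOException {
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    Terms terms = MultiFields.getTerms(reader, field);
    if (terms == null)
        return;
    TermsEnum termsEnum = terms.iterator(); // the reuse argument was dropped in 5.0
    BytesRef term;
    PostingsEnum postings = null;
    while ((term = termsEnum.next()) != null) {
        postings = termsEnum.postings(postings, PostingsEnum.NONE);
        int docId;
        while ((docId = postings.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            if (liveDocs != null && !liveDocs.get(docId))
                continue; // postings no longer filter deleted docs; check liveDocs here
            System.out.println(term.utf8ToString() + " -> doc " + docId);
        }
    }
}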
From source file:org.neo4j.kernel.api.impl.schema.sampler.NonUniqueLuceneIndexSampler.java
License:Open Source License
@Override
protected IndexSample performSampling() throws IndexNotFoundKernelException {
    NonUniqueIndexSampler sampler = new NonUniqueIndexSampler(indexSamplingConfig.sampleSizeLimit());
    IndexReader indexReader = indexSearcher.getIndexReader();
    for (LeafReaderContext readerContext : indexReader.leaves()) {
        try {
            Set<String> fieldNames = getFieldNamesToSample(readerContext);
            for (String fieldName : fieldNames) {
                Terms terms = readerContext.reader().terms(fieldName);
                if (terms != null) {
                    TermsEnum termsEnum = LuceneDocumentStructure.originalTerms(terms, fieldName);
                    BytesRef termsRef;
                    while ((termsRef = termsEnum.next()) != null) {
                        sampler.include(termsRef.utf8ToString(), termsEnum.docFreq());
                        checkCancellation();
                    }
                }
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
    return sampler.result(indexReader.numDocs());
}
From source file:org.nlp4l.lucene.BuddyWordsFinder.java
License:Apache License
public Scorer[] findCoincidentalTerms(String field, BytesRef term) throws IOException {
    baseTermFilter.start(reader, field, term);
    if (baseTermFilter.skip(term) || baseTermFilter.skipByPopularity(term))
        return null;
    //System.out.println(term.utf8ToString());
    Bits liveDocs = MultiFields.getLiveDocs(reader);
    PostingsEnum de = MultiFields.getTermDocsEnum(reader, field, term);
    if (de == null)
        return null;
    int numDocsAnalyzed = 0;
    phraseTerms.clear();
    while (de.nextDoc() != PostingsEnum.NO_MORE_DOCS && numDocsAnalyzed < maxDocsToAnalyze) {
        int docId = de.docID();
        // first record all of the positions of the term in a bitset which
        // represents terms in the current doc.
        int freq = de.freq();
        PostingsEnum pe = MultiFields.getTermPositionsEnum(reader, field, term);
        int ret = pe.advance(docId);
        if (ret == PostingsEnum.NO_MORE_DOCS)
            continue;
        termPos.clear();
        for (int i = 0; i < freq; i++) {
            int pos = pe.nextPosition();
            if (pos < termPos.size())
                termPos.set(pos);
        }
        // now look at all OTHER terms in this doc and see if they are
        // positioned in a pre-defined sized window around the current term
        Fields vectors = reader.getTermVectors(docId);
        // check it has term vectors
        if (vectors == null)
            return null;
        Terms vector = vectors.terms(field);
        // check it has position info
        if (vector == null || !vector.hasPositions())
            return null;
        TermsEnum te = vector.iterator();
        BytesRef otherTerm = null;
        while ((otherTerm = te.next()) != null) {
            if (term.bytesEquals(otherTerm))
                continue;
            coiTermFilter.start(reader, field, otherTerm);
            if (coiTermFilter.skip(otherTerm) || coiTermFilter.skipByPopularity(otherTerm))
                continue;
            PostingsEnum pe2 = MultiFields.getTermPositionsEnum(reader, field, otherTerm);
            ret = pe2.advance(docId);
            if (ret == PostingsEnum.NO_MORE_DOCS)
                continue;
            freq = pe2.freq();
            boolean matchFound = false;
            for (int i = 0; i < freq && (!matchFound); i++) {
                int pos = pe2.nextPosition();
                int startpos = Math.max(0, pos - slop);
                int endpos = pos + slop;
                for (int prevpos = startpos; (prevpos <= endpos) && (!matchFound); prevpos++) {
                    if (termPos.get(prevpos)) {
                        // Add term to hashmap containing co-occurrence
                        // counts for this term
                        Scorer pt = phraseTerms.get(otherTerm.utf8ToString());
                        if (pt == null) {
                            pt = new Scorer(baseTermFilter.getCurrentTermDocFreq(), otherTerm.utf8ToString(),
                                    coiTermFilter.getCurrentTermDocFreq());
                            phraseTerms.put(pt.coiTerm, pt);
                        }
                        pt.incCoiDocCount();
                        matchFound = true;
                    }
                }
            }
        }
        numDocsAnalyzed++;
    } // end of while loop
    // now sort and dump the top terms associated with this term.
    TopTerms topTerms = new TopTerms(maxCoiTermsPerTerm);
    for (String key : phraseTerms.keySet()) {
        Scorer pt = phraseTerms.get(key);
        topTerms.insertWithOverflow(pt);
    }
    Scorer[] tops = new Scorer[topTerms.size()];
    int tp = tops.length - 1;
    while (topTerms.size() > 0) {
        Scorer top = topTerms.pop();
        tops[tp--] = top;
    }
    return tops;
}
From source file:org.nlp4l.lucene.BuddyWordsFinderTermFilter.java
License:Apache License
protected boolean skip(BytesRef term) throws IOException {
    return skip(term.utf8ToString());
}
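This delegation allocates a new String for every term just to run the filter. Filters that only need byte-level checks can read the BytesRef directly; a hypothetical variant (skipIfStartsWithDigit is an illustrative name, not part of the source class):

// Hypothetical byte-level filter: rejects empty terms and terms starting
// with an ASCII digit, without the per-term utf8ToString() allocation.
protected boolean skipIfStartsWithDigit(BytesRef term) {
    if (term.length == 0)
        return true;
    byte first = term.bytes[term.offset];
    return first >= '0' && first <= '9';
}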
From source file:org.nlp4l.lucene.ConcatFreqLRCompoundNounScorer.java
License:Apache License
protected double getConcatenatedNounScore(String fieldName, String noun) throws IOException {
    Terms terms = MultiFields.getTerms(reader, fieldName);
    TermsEnum te = terms.iterator();
    te.seekCeil(new BytesRef(noun));
    BytesRef text = te.term();
    int count = 0;
    do {
        if (text == null || !text.utf8ToString().startsWith(noun))
            break;
        count += te.totalTermFreq();
    } while ((text = te.next()) != null);
    return count;
}
From source file:org.nlp4l.lucene.ConcatTypeCountLRCompoundNounScorer.java
License:Apache License
protected double getConcatenatedNounScore(String fieldName, String noun) throws IOException {
    Terms terms = MultiFields.getTerms(reader, fieldName);
    TermsEnum te = terms.iterator();
    te.seekCeil(new BytesRef(noun));
    BytesRef text = te.term();
    int count = 0;
    do {
        if (text == null || !text.utf8ToString().startsWith(noun))
            break;
        count++;
    } while ((text = te.next()) != null);
    return count;
}
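The two scorers above share a prefix-scan pattern with two rough edges: te.term() is called even if seekCeil() ran past the last term, and utf8ToString().startsWith(noun) allocates a String per term. A sketch of the same loop under the assumption that Lucene's StringHelper.startsWith(BytesRef, BytesRef) is available, which compares the raw UTF-8 bytes and so avoids the decode:

Terms terms = MultiFields.getTerms(reader, fieldName);
TermsEnum te = terms.iterator();
BytesRef prefix = new BytesRef(noun);
if (te.seekCeil(prefix) == TermsEnum.SeekStatus.END)
    return 0; // enum exhausted; te.term() would be undefined here
int count = 0;
BytesRef text = te.term();
do {
    if (!StringHelper.startsWith(text, prefix))
        break; // left the prefix range
    count++;
} while ((text = te.next()) != null);
return count;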
From source file:org.nlp4l.lucene.LuceneDocTermVector.java
License:Apache License
/**
 * Builds a term vector for the given Lucene document, keeping the top-weighted terms.
 *
 * @param reader     the {@link IndexReader} holding the target document
 * @param docId      the Lucene document ID of the target document
 * @param fieldName  the name of the Lucene field to extract terms from
 * @param size       the maximum number of terms to keep
 * @param termsReuse a {@link Terms} instance to reuse; if null, the term vector is read from the reader
 * @param liveDocs   the live-docs bits; if null, they are obtained from the reader
 * @param twf        the term weight factory; if null, a {@link DefaultTfIdfTermWeightFactory} is used
 * @param stopWords  terms to exclude as feature candidates; may be null
 * @throws IOException
 */
public LuceneDocTermVector(IndexReader reader, int docId, String fieldName, int size, Terms termsReuse,
        Bits liveDocs, TermWeightFactory twf, Set<String> stopWords) throws IOException {
    liveDocs = liveDocs == null ? MultiFields.getLiveDocs(reader) : liveDocs;
    twf = twf == null ? new DefaultTfIdfTermWeightFactory(reader, docId, fieldName, liveDocs) : twf;
    queue = new TermWeightQueue(size);
    if (termsReuse == null)
        termsReuse = reader.getTermVector(docId, fieldName);
    TermsEnum termsEnum = termsReuse.iterator();
    BytesRef text;
    while ((text = termsEnum.next()) != null) {
        // candidate feature term
        final String term = text.utf8ToString();
        if (stopWords != null && stopWords.contains(term))
            continue;
        final TermWeight termWeight = twf.create(text);
        if (termWeight == null)
            continue;
        Map.Entry<String, TermWeight> entry = new Map.Entry<String, TermWeight>() {
            public String getKey() {
                return term;
            }

            public TermWeight getValue() {
                return termWeight;
            }

            public TermWeight setValue(TermWeight arg0) {
                return null; // immutable entry; not supported
            }
        };
        queue.insertWithOverflow(entry);
    }
}
From source file:org.nlp4l.lucene.TermsExtractor.java
License:Apache License
public void execute() throws IOException {
    try {
        init();
        Terms terms = MultiFields.getTerms(reader, fieldNameCn);
        TermsEnum te = terms.iterator();
        BytesRef text = null;
        LuceneDocTermVector.TermWeightQueue queue = new LuceneDocTermVector.TermWeightQueue(outNum);
        int count = 0;
        while ((text = te.next()) != null) {
            /*
            if (count % 5000 == 0) {
                logger.printTime(count);
            }
            */
            final String term = text.utf8ToString();
            // http://rondhuit-dev.com/trac/projects/ticket/184
            if (P_SPACES.matcher(term).find())
                continue;
            final LuceneDocTermVector.TermWeight termWeight = new TermScore((float) scorer.score(term));
            Map.Entry<String, LuceneDocTermVector.TermWeight> entry = new Map.Entry<String, LuceneDocTermVector.TermWeight>() {
                public String getKey() {
                    return term;
                }

                public LuceneDocTermVector.TermWeight getValue() {
                    return termWeight;
                }

                public LuceneDocTermVector.TermWeight setValue(LuceneDocTermVector.TermWeight arg0) {
                    return null; // immutable entry; not supported
                }
            };
            queue.insertWithOverflow(entry);
            count++;
        }
        //logger.log("number of compound nouns is %d\n", count);
        printQueue(queue);
    } finally {
        try {
            if (reader != null)
                reader.close();
        } catch (IOException e) {
            // ignored: best-effort close
        }
        IOUtils.closeQuietly(pw);
    }
}
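Both this example and LuceneDocTermVector above hand-roll an anonymous Map.Entry just to pair a term with its weight. The JDK's AbstractMap.SimpleImmutableEntry (available since Java 6) does the same in one line; a sketch, not from the sources:

import java.util.AbstractMap;
import java.util.Map;

Map.Entry<String, LuceneDocTermVector.TermWeight> entry =
        new AbstractMap.SimpleImmutableEntry<>(term, termWeight);
queue.insertWithOverflow(entry);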
From source file:org.ohdsi.usagi.tests.TestLucene.java
License:Apache License
public static void main(String[] args) throws IOException, ParseException {
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_9);
    //Analyzer analyzer = new UsagiAnalyzer();
    FieldType textVectorField = new FieldType();
    textVectorField.setIndexed(true);
    textVectorField.setTokenized(true);
    textVectorField.setStoreTermVectors(true);
    textVectorField.setStoreTermVectorPositions(false);
    textVectorField.setStoreTermVectorPayloads(false);
    textVectorField.setStoreTermVectorOffsets(false);
    textVectorField.setStored(true);
    textVectorField.freeze();
    File indexFolder = new File(folder);
    if (indexFolder.exists())
        DirectoryUtilities.deleteDir(indexFolder);
    Directory dir = FSDirectory.open(indexFolder);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
    iwc.setOpenMode(OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(256.0);
    IndexWriter writer = new IndexWriter(dir, iwc);
    Document doc = new Document();
    doc.add(new Field("F", "word1 word2 w3 word4", textVectorField));
    writer.addDocument(doc);
    doc = new Document();
    doc.add(new Field("F", "word1 word2 w3", textVectorField));
    writer.addDocument(doc);
    writer.close();
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(folder)));
    for (int i = 0; i < reader.numDocs(); i++) {
        TermsEnum termsEnum = reader.getTermVector(i, "F").iterator(null);
        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            System.out.print(text.utf8ToString() + ",");
        }
        System.out.println();
    }
    IndexSearcher searcher = new IndexSearcher(reader);
    // MoreLikeThis mlt = new MoreLikeThis(searcher.getIndexReader());
    // mlt.setMinTermFreq(0);
    // mlt.setMinDocFreq(0);
    // mlt.setMaxDocFreq(9999);
    // mlt.setMinWordLen(0);
    // mlt.setMaxWordLen(9999);
    // mlt.setMaxDocFreqPct(100);
    // mlt.setMaxNumTokensParsed(9999);
    // mlt.setMaxQueryTerms(9999);
    // mlt.setStopWords(null);
    // mlt.setFieldNames(new String[] { "F" });
    // mlt.setAnalyzer(new UsagiAnalyzer());
    // Query query = mlt.like("F", new StringReader("Systolic blood pressure"));
    QueryParser parser = new QueryParser(Version.LUCENE_4_9, "F", analyzer);
    Query query = parser.parse("word1");
    Explanation explanation = searcher.explain(query, 0);
    print(explanation);
    System.out.println();
    explanation = searcher.explain(query, 1);
    print(explanation);
    System.out.println();
    TopDocs topDocs = searcher.search(query, 99);
    for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
        System.out.println(scoreDoc.score + "\t" + reader.document(scoreDoc.doc).get("F"));
    }
}
From source file:org.opengrok.suggest.query.SuggesterRangeQuery.java
License:Open Source License
/** {@inheritDoc} */
@Override
public int length() {
    BytesRef prefix = getPrefix();
    if (prefix == null) {
        return 0;
    }
    return prefix.utf8ToString().length();
}
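Note that utf8ToString().length() counts UTF-16 code units of the decoded String, not bytes; for non-ASCII prefixes it differs from prefix.length. A standalone illustration, not from the source:

BytesRef b = new BytesRef("héllo");
int byteLen = b.length;                   // 6 — 'é' takes two UTF-8 bytes
int charLen = b.utf8ToString().length();  // 5 — five UTF-16 code units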