List of usage examples for org.apache.lucene.util.BytesRef.utf8ToString()
public String utf8ToString()
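Converts the bytes referenced by this BytesRef, interpreted as UTF-8, into a String. A minimal standalone sketch of the round trip (the class name is hypothetical, not taken from the examples below):

import org.apache.lucene.util.BytesRef;

public class Utf8ToStringExample {
    public static void main(String[] args) {
        // The CharSequence constructor encodes the given text as UTF-8 bytes.
        BytesRef bytes = new BytesRef("héllo wörld");
        // utf8ToString() decodes the referenced byte slice back into a String.
        String decoded = bytes.utf8ToString();
        System.out.println(decoded); // prints: héllo wörld
    }
}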
From source file:org.typo3.solr.search.AccessFilter.java
License:Apache License
/**
 * Checks whether access to the given document is granted based on the access
 * information stored in a single-valued field.
 *
 * @param doc
 * @param values
 * @return boolean TRUE if access is granted, otherwise FALSE
 * @throws IOException
 */
private boolean handleSingleValueAccessField(int doc, SortedDocValues values) throws IOException {
    BytesRef bytes = values.get(doc);
    String documentGroupList = bytes.utf8ToString();
    return accessGranted(documentGroupList);
}
From source file:org.typo3.solr.search.AccessFilter.java
License:Apache License
/**
 * Checks whether access to the given document is granted based on the access
 * information stored in a multi-valued field.
 *
 * @param doc
 * @param multiValueSet
 * @return boolean TRUE if access is granted, otherwise FALSE
 * @throws IOException
 */
private boolean handleMultivalueAccessField(int doc, SortedSetDocValues multiValueSet) throws IOException {
    long ord;
    multiValueSet.setDocument(doc);
    while ((ord = multiValueSet.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
        BytesRef bytes = multiValueSet.lookupOrd(ord);
        String documentGroupList = bytes.utf8ToString();
        if (accessGranted(documentGroupList)) {
            return true;
        }
    }
    return false;
}
From source file:org.voyanttools.trombone.lucene.CorpusMapper.java
License:Open Source License
/**
 * This should not be called, except from the private build() method.
 *
 * @throws IOException
 */
private void buildFromTermsEnum() throws IOException {
    LeafReader reader = SlowCompositeReaderWrapper
            .wrap(storage.getLuceneManager().getDirectoryReader(corpus.getId()));
    Terms terms = reader.terms("id");
    TermsEnum termsEnum = terms.iterator();
    BytesRef bytesRef = termsEnum.next();
    int doc;
    String id;
    Set<String> ids = new HashSet<String>(getCorpusDocumentIds());
    bitSet = new SparseFixedBitSet(reader.numDocs());
    Bits liveBits = reader.getLiveDocs();
    while (bytesRef != null) {
        PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.NONE);
        doc = postingsEnum.nextDoc();
        if (doc != PostingsEnum.NO_MORE_DOCS) {
            id = bytesRef.utf8ToString();
            if (ids.contains(id)) {
                bitSet.set(doc);
                luceneIds.add(doc);
                documentIdToLuceneIdMap.put(id, doc);
                luceneIdToDocumentIdMap.put(doc, id);
            }
        }
        bytesRef = termsEnum.next();
    }
    this.reader = new FilteredCorpusReader(reader, bitSet);
}
From source file:org.voyanttools.trombone.tool.corpus.AbstractContextTerms.java
License:Open Source License
private void fillTermsOfInterest(LeafReader leafReader, int luceneDoc, Map<Integer, TermInfo> termsOfInterest)
        throws IOException {
    // fill in terms of interest
    Terms terms = leafReader.getTermVector(luceneDoc, tokenType.name());
    TermsEnum termsEnum = terms.iterator();
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
        String termString = term.utf8ToString();
        PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
        if (postingsEnum != null) {
            postingsEnum.nextDoc();
            for (int i = 0, len = postingsEnum.freq(); i < len; i++) {
                int pos = postingsEnum.nextPosition();
                if (termsOfInterest.containsKey(pos)) {
                    termsOfInterest.put(pos, new TermInfo(termString, postingsEnum.startOffset(),
                            postingsEnum.endOffset(), pos, 1));
                }
            }
        }
    }
}
From source file:org.voyanttools.trombone.tool.corpus.CorpusTerms.java
License:Open Source License
private FlexibleQueue<CorpusTerm> runAllTermsWithDistributionsDocumentTermVectors(CorpusMapper corpusMapper,
        Keywords stopwords) throws IOException {
    FlexibleQueue<CorpusTerm> queue = new FlexibleQueue<CorpusTerm>(comparator, start + limit);
    LeafReader reader = corpusMapper.getLeafReader();
    Map<String, Map<Integer, Integer>> rawFreqsMap = new HashMap<String, Map<Integer, Integer>>();
    TermsEnum termsEnum = null;
    for (int doc : corpusMapper.getLuceneIds()) {
        Terms terms = reader.getTermVector(doc, tokenType.name());
        if (terms != null) {
            termsEnum = terms.iterator();
            if (termsEnum != null) {
                BytesRef bytesRef = termsEnum.next();
                while (bytesRef != null) {
                    String term = bytesRef.utf8ToString();
                    if (!stopwords.isKeyword(term)) {
                        if (!rawFreqsMap.containsKey(term)) {
                            rawFreqsMap.put(term, new HashMap<Integer, Integer>());
                        }
                        int rawF = (int) termsEnum.totalTermFreq();
                        if (rawF > minRawFreq) {
                            rawFreqsMap.get(term).put(corpusMapper.getDocumentPositionFromLuceneId(doc), rawF);
                        }
                    }
                    bytesRef = termsEnum.next();
                }
            }
        }
    }
    int corpusSize = corpusMapper.getCorpus().size();
    int[] tokensCounts = corpusMapper.getCorpus().getTokensCounts(tokenType);
    int totalCorpusTokens = corpusMapper.getCorpus().getTokensCount(tokenType);
    int bins = parameters.getParameterIntValue("bins", corpusSize);
    int[] documentRawFreqs;
    float[] documentRelativeFreqs;
    int documentPosition;
    int termFreq;
    int freq;
    for (Map.Entry<String, Map<Integer, Integer>> termsMap : rawFreqsMap.entrySet()) {
        String termString = termsMap.getKey();
        documentRawFreqs = new int[corpusSize];
        documentRelativeFreqs = new float[corpusSize];
        termFreq = 0;
        for (Map.Entry<Integer, Integer> docsMap : termsMap.getValue().entrySet()) {
            documentPosition = docsMap.getKey();
            freq = docsMap.getValue();
            termFreq += freq;
            totalTokens += freq;
            documentRawFreqs[documentPosition] = freq;
            documentRelativeFreqs[documentPosition] = (float) freq / tokensCounts[documentPosition];
        }
        if (termFreq > minRawFreq) {
            CorpusTerm corpusTerm = new CorpusTerm(termString, termFreq, totalCorpusTokens,
                    termsMap.getValue().size(), corpusSize, documentRawFreqs, documentRelativeFreqs, bins);
            offer(queue, corpusTerm);
        }
    }
    return queue;
}
From source file:org.voyanttools.trombone.tool.corpus.DocumentNgrams.java
License:Open Source License
private SimplifiedTermInfo[] getSparseSimplifiedTermInfoArray(CorpusMapper corpusMapper, int luceneDoc,
        int lastTokenOffset) throws IOException {
    Keywords stopwords = this.getStopwords(corpusMapper.getCorpus());
    Terms terms = corpusMapper.getLeafReader().getTermVector(luceneDoc, tokenType.name());
    TermsEnum termsEnum = terms.iterator();
    SimplifiedTermInfo[] simplifiedTermInfoArray = new SimplifiedTermInfo[lastTokenOffset + 1];
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
        String termString = term.utf8ToString();
        //if (stopwords.isKeyword(termString)) {continue;} // treat as whitespace or punctuation
        PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
        while (postingsEnum.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
            int freq = postingsEnum.freq();
            for (int i = 0, len = freq; i < len; i++) {
                int pos = postingsEnum.nextPosition();
                simplifiedTermInfoArray[pos] = freq > 1
                        ? new SimplifiedTermInfo(termString, pos, 1, freq, postingsEnum.startOffset(),
                                postingsEnum.endOffset())
                        : new SimplifiedTermInfo(""); // empty string if not repeating
            }
        }
    }
    return simplifiedTermInfoArray;
}
From source file:org.voyanttools.trombone.tool.corpus.DocumentTerms.java
License:Open Source License
private void runAllTermsFromDocumentTermVectors(CorpusMapper corpusMapper, Keywords stopwords)
        throws IOException {
    FlexibleQueue<DocumentTerm> queue = new FlexibleQueue<DocumentTerm>(comparator, start + limit);
    LeafReader reader = corpusMapper.getLeafReader();
    Corpus corpus = corpusMapper.getCorpus();
    CorpusTermMinimalsDB corpusTermMinimalsDB = CorpusTermMinimalsDB.getInstance(corpusMapper, tokenType);
    TermsEnum termsEnum = null;
    Bits docIdBitSet = corpusMapper
            .getBitSetFromDocumentIds(this.getCorpusStoredDocumentIdsFromParameters(corpus));
    Bits allBits = new Bits.MatchAllBits(reader.numDocs());
    int[] tokenCounts = corpus.getTokensCounts(tokenType);
    float[] typesCountMeans = corpus.getTypesCountMeans(tokenType);
    float[] typesCountStdDev = corpus.getTypesCountStdDevs(tokenType);
    for (int doc : corpusMapper.getLuceneIds()) {
        if (!docIdBitSet.get(doc)) {
            continue;
        }
        FlexibleQueue<DocumentTerm> docQueue = new FlexibleQueue<DocumentTerm>(comparator,
                limit * docIdBitSet.length());
        int documentPosition = corpusMapper.getDocumentPositionFromLuceneId(doc);
        String docId = corpusMapper.getDocumentIdFromLuceneId(doc);
        float mean = typesCountMeans[documentPosition];
        float stdDev = typesCountStdDev[documentPosition];
        int totalTokensCount = tokenCounts[documentPosition];
        Terms terms = reader.getTermVector(doc, tokenType.name());
        if (terms != null) {
            termsEnum = terms.iterator();
            if (termsEnum != null) {
                BytesRef bytesRef = termsEnum.next();
                while (bytesRef != null) {
                    String termString = bytesRef.utf8ToString();
                    if (whiteList.isEmpty() == false && whiteList.isKeyword(termString) == false) {
                        bytesRef = termsEnum.next();
                        continue;
                    }
                    if (!stopwords.isKeyword(termString)) {
                        CorpusTermMinimal corpusTermMinimal = corpusTermMinimalsDB.get(termString);
                        int[] positions = null;
                        int[] offsets = null;
                        int freq;
                        if (isNeedsPositions || isNeedsOffsets) {
                            PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
                            postingsEnum.nextDoc();
                            freq = postingsEnum.freq();
                            positions = new int[freq];
                            offsets = new int[freq];
                            for (int i = 0; i < freq; i++) {
                                positions[i] = postingsEnum.nextPosition();
                                offsets[i] = postingsEnum.startOffset();
                            }
                        } else {
                            freq = (int) termsEnum.totalTermFreq();
                        }
                        if (freq >= minRawFreq) {
                            total++;
                            float zscore = stdDev != 0 ? ((freq - mean) / stdDev) : Float.NaN;
                            DocumentTerm documentTerm = new DocumentTerm(documentPosition, docId, termString,
                                    freq, totalTokensCount, zscore, positions, offsets, corpusTermMinimal);
                            docQueue.offer(documentTerm);
                        }
                    }
                    bytesRef = termsEnum.next();
                }
            }
        }
        int i = 0;
        for (DocumentTerm docTerm : docQueue.getOrderedList()) {
            queue.offer(docTerm);
            if (++i >= perDocLimit) {
                break;
            }
        }
    }
    corpusTermMinimalsDB.close();
    this.terms.addAll(queue.getOrderedList(start));
}
From source file:org.zenoss.zep.index.impl.lucene.LuceneQueryBuilder.java
License:Open Source License
private static Term[] getMatchingTerms(String fieldName, IndexReader reader, String value) throws ZepException {
    // Don't search for matches if text doesn't contain wildcards
    if (value.indexOf('*') == -1 && value.indexOf('?') == -1) {
        return new Term[] { new Term(fieldName, value) };
    }
    logger.debug("getMatchingTerms: field={}, value={}", fieldName, value);
    List<Term> matches = new ArrayList<Term>();
    Automaton automaton = WildcardQuery.toAutomaton(new Term(fieldName, value));
    CompiledAutomaton compiled = new CompiledAutomaton(automaton);
    try {
        Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms(fieldName);
        TermsEnum wildcardTermEnum = compiled.getTermsEnum(terms);
        BytesRef match;
        while (wildcardTermEnum.next() != null) {
            match = wildcardTermEnum.term();
            logger.debug("Match: {}", match);
            matches.add(new Term(fieldName, match.utf8ToString()));
        }
        return matches.toArray(new Term[matches.size()]);
    } catch (IOException e) {
        throw new ZepException(e.getLocalizedMessage(), e);
    }
}
From source file:perf.ShowFields.java
License:Apache License
public static void main(String[] args) throws CorruptIndexException, IOException {
    DirectoryReader reader = DirectoryReader.open(FSDirectory.open(
            new File("/home/simon/work/projects/lucene/bench/indices/Standard.work.trunk.wiki.nd0.1M/index")));
    Fields fields = MultiFields.getFields(reader);
    for (String name : fields) {
        System.out.println(name);
        if (name.equals("docdate")) {
            TermsEnum terms = fields.terms(name).iterator(null);
            BytesRef ref;
            int i = 0;
            while ((ref = terms.next()) != null) {
                System.out.println(ref.utf8ToString());
                if (i++ == 10) {
                    break;
                }
            }
        }
    }
}
From source file:pretraga.IsolationSimilarity.java
public void test(String vec) {
    List<String> vector = processInput(vec);
    HashMap<String, Long> map = new HashMap<>();
    try {
        Directory dir = FSDirectory.open(new File(indexDirectoryPath).toPath());
        IndexReader reader = DirectoryReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);
        List<Integer> docId = getDocumentsFromVector(vector, reader, searcher);
        for (int i = 0; i < docId.size(); i++) {
            Fields ff = reader.getTermVectors(docId.get(i));
            Terms terms = ff.terms(CONTENT);
            TermsEnum te = terms.iterator();
            BytesRef by;
            while ((by = te.next()) != null) {
                String term = by.utf8ToString();
                ClassicSimilarity sim = null;
                if (searcher.getSimilarity(true) instanceof ClassicSimilarity) {
                    sim = (ClassicSimilarity) searcher.getSimilarity(true);
                }
                float idf = sim.idf(te.docFreq(), reader.maxDoc());
                float tf = sim.tf(te.totalTermFreq());
                TermStatistics ts = new TermStatistics(by, te.docFreq(), te.totalTermFreq());
                CollectionStatistics s = new CollectionStatistics(CONTENT, reader.maxDoc(), terms.getDocCount(),
                        terms.getSumTotalTermFreq(), terms.getSumDocFreq());
                Document d = reader.document(docId.get(i));
                if (vector.contains(term)) {
                    float ttt = sim.simScorer(sim.computeWeight(s, ts), reader.getContext().leaves().get(0))
                            .score(docId.get(i), te.totalTermFreq());
                    System.out.println(ttt + ", " + d.get(TITLE) + ", term: " + term);
                }
            }
        }
    } catch (Exception e) {
    }
}