Example usage for org.apache.lucene.util BytesRef utf8ToString

Introduction

On this page you can find example usage of org.apache.lucene.util.BytesRef.utf8ToString().

Prototype

public String utf8ToString() 

Document

Interprets the stored bytes as UTF-8, returning the resulting string.
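
For orientation before the usage listings, here is a minimal, self-contained sketch of the String/BytesRef round trip. The demo class and main method are illustrative only; BytesRef(CharSequence) and utf8ToString() are the actual Lucene core API:

import org.apache.lucene.util.BytesRef;

public class BytesRefDemo {
    public static void main(String[] args) {
        // The BytesRef(CharSequence) constructor stores the string as UTF-8 bytes.
        BytesRef bytes = new BytesRef("café");
        // utf8ToString() decodes those bytes back into a Java String.
        System.out.println(bytes.utf8ToString()); // prints "café"
    }
}

Note that utf8ToString() assumes the referenced bytes are valid UTF-8: a BytesRef holding binary content (for example, an encoded numeric term) will not decode to a meaningful string.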

Usage

From source file: org.typo3.solr.search.AccessFilter.java

License: Apache License

/**
 * Checks whether access is granted for a document whose access
 * information is stored in a single-valued field.
 *
 * @param doc the Lucene document id
 * @param values the doc values holding the document's access group list
 * @return boolean TRUE if access is granted, otherwise FALSE
 * @throws IOException
 */

private boolean handleSingleValueAccessField(int doc, SortedDocValues values) throws IOException {
    BytesRef bytes = values.get(doc);
    String documentGroupList = bytes.utf8ToString();
    return accessGranted(documentGroupList);
}

From source file: org.typo3.solr.search.AccessFilter.java

License: Apache License

/**
 * Checks whether access is granted for a document whose access
 * information is stored in a multi-valued field.
 *
 * @param doc the Lucene document id
 * @param multiValueSet the doc values holding the document's access group lists
 * @return boolean TRUE if access is granted, otherwise FALSE
 * @throws IOException
 */
private boolean handleMultivalueAccessField(int doc, SortedSetDocValues multiValueSet) throws IOException {
    long ord;
    multiValueSet.setDocument(doc);

    while ((ord = multiValueSet.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
        BytesRef bytes = multiValueSet.lookupOrd(ord);
        String documentGroupList = bytes.utf8ToString();

        if (accessGranted(documentGroupList)) {
            return true;
        }
    }
    return false;
}

From source file: org.voyanttools.trombone.lucene.CorpusMapper.java

License: Open Source License

/**
 * This should not be called, except from the private build() method.
 * @throws IOException
 */
private void buildFromTermsEnum() throws IOException {
    LeafReader reader = SlowCompositeReaderWrapper
            .wrap(storage.getLuceneManager().getDirectoryReader(corpus.getId()));

    Terms terms = reader.terms("id");
    TermsEnum termsEnum = terms.iterator();
    BytesRef bytesRef = termsEnum.next();
    int doc;
    String id;
    Set<String> ids = new HashSet<String>(getCorpusDocumentIds());
    bitSet = new SparseFixedBitSet(reader.numDocs());
    Bits liveBits = reader.getLiveDocs();
    while (bytesRef != null) {
        PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.NONE);
        doc = postingsEnum.nextDoc();
        if (doc != PostingsEnum.NO_MORE_DOCS) {
            id = bytesRef.utf8ToString();
            if (ids.contains(id)) {
                bitSet.set(doc);
                luceneIds.add(doc);
                documentIdToLuceneIdMap.put(id, doc);
                luceneIdToDocumentIdMap.put(doc, id);
            }
        }
        bytesRef = termsEnum.next();
    }
    this.reader = new FilteredCorpusReader(reader, bitSet);
}

From source file: org.voyanttools.trombone.tool.corpus.AbstractContextTerms.java

License: Open Source License

private void fillTermsOfInterest(LeafReader leafReader, int luceneDoc, Map<Integer, TermInfo> termsOfInterest)
        throws IOException {
    // fill in terms of interest
    Terms terms = leafReader.getTermVector(luceneDoc, tokenType.name());
    TermsEnum termsEnum = terms.iterator();
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
        String termString = term.utf8ToString();
        PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
        if (postingsEnum != null) {
            postingsEnum.nextDoc();
            for (int i = 0, len = postingsEnum.freq(); i < len; i++) {
                int pos = postingsEnum.nextPosition();
                if (termsOfInterest.containsKey(pos)) {
                    termsOfInterest.put(pos, new TermInfo(termString, postingsEnum.startOffset(),
                            postingsEnum.endOffset(), pos, 1));
                }
            }
        }
    }
}

From source file: org.voyanttools.trombone.tool.corpus.CorpusTerms.java

License: Open Source License

private FlexibleQueue<CorpusTerm> runAllTermsWithDistributionsDocumentTermVectors(CorpusMapper corpusMapper,
        Keywords stopwords) throws IOException {
    FlexibleQueue<CorpusTerm> queue = new FlexibleQueue<CorpusTerm>(comparator, start + limit);

    LeafReader reader = corpusMapper.getLeafReader();
    Map<String, Map<Integer, Integer>> rawFreqsMap = new HashMap<String, Map<Integer, Integer>>();
    TermsEnum termsEnum = null;
    for (int doc : corpusMapper.getLuceneIds()) {
        Terms terms = reader.getTermVector(doc, tokenType.name());
        if (terms != null) {
            termsEnum = terms.iterator();
            if (termsEnum != null) {
                BytesRef bytesRef = termsEnum.next();
                while (bytesRef != null) {
                    String term = bytesRef.utf8ToString();
                    if (!stopwords.isKeyword(term)) {
                        if (!rawFreqsMap.containsKey(term)) {
                            rawFreqsMap.put(term, new HashMap<Integer, Integer>());
                        }
                        int rawF = (int) termsEnum.totalTermFreq();
                        if (rawF > minRawFreq) {
                            rawFreqsMap.get(term).put(corpusMapper.getDocumentPositionFromLuceneId(doc), rawF);
                        }
                    }
                    bytesRef = termsEnum.next();
                }
            }
        }
    }

    int corpusSize = corpusMapper.getCorpus().size();
    int[] tokensCounts = corpusMapper.getCorpus().getTokensCounts(tokenType);
    int totalCorpusTokens = corpusMapper.getCorpus().getTokensCount(tokenType);
    int bins = parameters.getParameterIntValue("bins", corpusSize);
    int[] documentRawFreqs;
    float[] documentRelativeFreqs;
    int documentPosition;
    int termFreq;
    int freq;
    for (Map.Entry<String, Map<Integer, Integer>> termsMap : rawFreqsMap.entrySet()) {
        String termString = termsMap.getKey();
        documentRawFreqs = new int[corpusSize];
        documentRelativeFreqs = new float[corpusSize];
        termFreq = 0;
        for (Map.Entry<Integer, Integer> docsMap : termsMap.getValue().entrySet()) {
            documentPosition = docsMap.getKey();
            freq = docsMap.getValue();
            termFreq += freq;
            totalTokens += freq;
            documentRawFreqs[documentPosition] = freq;
            documentRelativeFreqs[documentPosition] = (float) freq / tokensCounts[documentPosition];
        }
        if (termFreq > minRawFreq) {
            CorpusTerm corpusTerm = new CorpusTerm(termString, termFreq, totalCorpusTokens,
                    termsMap.getValue().size(), corpusSize, documentRawFreqs, documentRelativeFreqs, bins);
            offer(queue, corpusTerm);
        }
    }
    return queue;
}

From source file: org.voyanttools.trombone.tool.corpus.DocumentNgrams.java

License: Open Source License

private SimplifiedTermInfo[] getSparseSimplifiedTermInfoArray(CorpusMapper corpusMapper, int luceneDoc,
        int lastTokenOffset) throws IOException {

    Keywords stopwords = this.getStopwords(corpusMapper.getCorpus());
    Terms terms = corpusMapper.getLeafReader().getTermVector(luceneDoc, tokenType.name());
    TermsEnum termsEnum = terms.iterator();
    SimplifiedTermInfo[] simplifiedTermInfoArray = new SimplifiedTermInfo[lastTokenOffset + 1];
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
        String termString = term.utf8ToString();
        //if (stopwords.isKeyword(termString)) {continue;} // treat as whitespace or punctuation
        PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
        while (postingsEnum.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
            int freq = postingsEnum.freq();
            for (int i = 0; i < freq; i++) {
                int pos = postingsEnum.nextPosition();
                simplifiedTermInfoArray[pos] = freq > 1
                        ? new SimplifiedTermInfo(termString, pos, 1, freq, postingsEnum.startOffset(),
                                postingsEnum.endOffset())
                        : new SimplifiedTermInfo(""); // empty string if not repeating
            }
        }
    }
    return simplifiedTermInfoArray;
}

From source file: org.voyanttools.trombone.tool.corpus.DocumentTerms.java

License: Open Source License

private void runAllTermsFromDocumentTermVectors(CorpusMapper corpusMapper, Keywords stopwords)
        throws IOException {
    FlexibleQueue<DocumentTerm> queue = new FlexibleQueue<DocumentTerm>(comparator, start + limit);
    LeafReader reader = corpusMapper.getLeafReader();
    Corpus corpus = corpusMapper.getCorpus();
    CorpusTermMinimalsDB corpusTermMinimalsDB = CorpusTermMinimalsDB.getInstance(corpusMapper, tokenType);
    TermsEnum termsEnum = null;
    Bits docIdBitSet = corpusMapper
            .getBitSetFromDocumentIds(this.getCorpusStoredDocumentIdsFromParameters(corpus));
    Bits allBits = new Bits.MatchAllBits(reader.numDocs());
    int[] tokenCounts = corpus.getTokensCounts(tokenType);
    float[] typesCountMeans = corpus.getTypesCountMeans(tokenType);
    float[] typesCountStdDev = corpus.getTypesCountStdDevs(tokenType);
    for (int doc : corpusMapper.getLuceneIds()) {
        if (!docIdBitSet.get(doc)) {
            continue;
        }
        FlexibleQueue<DocumentTerm> docQueue = new FlexibleQueue<DocumentTerm>(comparator,
                limit * docIdBitSet.length());
        int documentPosition = corpusMapper.getDocumentPositionFromLuceneId(doc);
        String docId = corpusMapper.getDocumentIdFromLuceneId(doc);
        float mean = typesCountMeans[documentPosition];
        float stdDev = typesCountStdDev[documentPosition];
        int totalTokensCount = tokenCounts[documentPosition];
        Terms terms = reader.getTermVector(doc, tokenType.name());
        if (terms != null) {
            termsEnum = terms.iterator();
            if (termsEnum != null) {
                BytesRef bytesRef = termsEnum.next();

                while (bytesRef != null) {
                    String termString = bytesRef.utf8ToString();
                    if (!whiteList.isEmpty() && !whiteList.isKeyword(termString)) {
                        bytesRef = termsEnum.next();
                        continue;
                    }
                    if (!stopwords.isKeyword(termString)) {
                        CorpusTermMinimal corpusTermMinimal = corpusTermMinimalsDB.get(termString);
                        int[] positions = null;
                        int[] offsets = null;
                        int freq;
                        if (isNeedsPositions || isNeedsOffsets) {
                            PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
                            postingsEnum.nextDoc();
                            freq = postingsEnum.freq();
                            positions = new int[freq];
                            offsets = new int[freq];
                            for (int i = 0; i < freq; i++) {
                                positions[i] = postingsEnum.nextPosition();
                                offsets[i] = postingsEnum.startOffset();
                            }
                        } else {
                            freq = (int) termsEnum.totalTermFreq();
                        }
                        if (freq >= minRawFreq) {
                            total++;
                            float zscore = stdDev != 0 ? ((freq - mean) / stdDev) : Float.NaN;
                            DocumentTerm documentTerm = new DocumentTerm(documentPosition, docId, termString,
                                    freq, totalTokensCount, zscore, positions, offsets, corpusTermMinimal);
                            docQueue.offer(documentTerm);
                        }
                    }
                    bytesRef = termsEnum.next();
                }
            }
        }
        int i = 0;
        for (DocumentTerm docTerm : docQueue.getOrderedList()) {
            queue.offer(docTerm);
            if (++i >= perDocLimit) {
                break;
            }
        }
    }
    corpusTermMinimalsDB.close();
    this.terms.addAll(queue.getOrderedList(start));
}

From source file: org.zenoss.zep.index.impl.lucene.LuceneQueryBuilder.java

License: Open Source License

private static Term[] getMatchingTerms(String fieldName, IndexReader reader, String value) throws ZepException {
    // Don't search for matches if text doesn't contain wildcards
    if (value.indexOf('*') == -1 && value.indexOf('?') == -1)
        return new Term[] { new Term(fieldName, value) };

    logger.debug("getMatchingTerms: field={}, value={}", fieldName, value);
    List<Term> matches = new ArrayList<Term>();
    Automaton automaton = WildcardQuery.toAutomaton(new Term(fieldName, value));
    CompiledAutomaton compiled = new CompiledAutomaton(automaton);
    try {
        Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms(fieldName);
        TermsEnum wildcardTermEnum = compiled.getTermsEnum(terms);
        BytesRef match;
        while (wildcardTermEnum.next() != null) {
            match = wildcardTermEnum.term();
            logger.debug("Match: {}", match);
            matches.add(new Term(fieldName, match.utf8ToString()));
        }
        return matches.toArray(new Term[matches.size()]);
    } catch (IOException e) {
        throw new ZepException(e.getLocalizedMessage(), e);
    }
}

From source file: perf.ShowFields.java

License: Apache License

public static void main(String[] args) throws CorruptIndexException, IOException {
    DirectoryReader reader = DirectoryReader.open(FSDirectory.open(
            new File("/home/simon/work/projects/lucene/bench/indices/Standard.work.trunk.wiki.nd0.1M/index")));
    Fields fields = MultiFields.getFields(reader);
    for (String name : fields) {
        System.out.println(name);
        if (name.equals("docdate")) {
            TermsEnum terms = fields.terms(name).iterator(null);
            BytesRef ref;
            int i = 0;
            while ((ref = terms.next()) != null) {
                System.out.println(ref.utf8ToString());
                if (i++ == 10) {
                    break;
                }
            }
        }
    }
}

From source file: pretraga.IsolationSimilarity.java

public void test(String vec) {
    List<String> vector = processInput(vec);
    HashMap<String, Long> map = new HashMap<>();
    try {
        Directory dir = FSDirectory.open(new File(indexDirectoryPath).toPath());

        IndexReader reader = DirectoryReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);

        List<Integer> docId = getDocumentsFromVector(vector, reader, searcher);

        for (int i = 0; i < docId.size(); i++) {
            Fields ff = reader.getTermVectors(docId.get(i));
            Terms terms = ff.terms(CONTENT);

            TermsEnum te = terms.iterator();
            BytesRef by;
            while ((by = te.next()) != null) {
                String term = by.utf8ToString();

                ClassicSimilarity sim = null;
                if (searcher.getSimilarity(true) instanceof ClassicSimilarity) {
                    sim = (ClassicSimilarity) searcher.getSimilarity(true);
                }
                if (sim == null) {
                    // guard: the calculations below assume ClassicSimilarity
                    continue;
                }
                float idf = sim.idf(te.docFreq(), reader.maxDoc());
                float tf = sim.tf(te.totalTermFreq());
                //System.out.println("idf = " + idf + ", tf = " + tf + ", docF: " + te.totalTermFreq());
                TermStatistics ts = new TermStatistics(by, te.docFreq(), te.totalTermFreq());
                CollectionStatistics s = new CollectionStatistics(CONTENT, reader.maxDoc(), terms.getDocCount(),
                        terms.getSumTotalTermFreq(), terms.getSumDocFreq());
                Document d = reader.document(docId.get(i));
                if (vector.contains(term)) {
                    float ttt = sim.simScorer(sim.computeWeight(s, ts), reader.getContext().leaves().get(0))
                            .score(docId.get(i), te.totalTermFreq());
                    System.out.println(ttt + ", " + d.get(TITLE) + ", term: " + term);
                }
            }

        }
    } catch (Exception e) {
        e.printStackTrace(); // surface the failure instead of silently swallowing it
    }
}