List of usage examples for org.apache.lucene.util.BytesRef.utf8ToString()
public String utf8ToString()
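Before the per-project examples, here is a minimal self-contained sketch of the call itself: BytesRef wraps UTF-8 bytes, and utf8ToString() decodes them back into a java.lang.String. The class name and string literal below are illustrative only and are not taken from any of the source files that follow.

import org.apache.lucene.util.BytesRef;

public class Utf8ToStringExample {
    public static void main(String[] args) {
        // Wrap a String as UTF-8 bytes, then decode it back.
        BytesRef ref = new BytesRef("hello lucene"); // example value only
        String decoded = ref.utf8ToString();
        System.out.println(decoded); // prints: hello lucene
    }
}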
From source file:org.apache.blur.analysis.type.AclDiscoverFieldTypeDefinition.java
License:Apache License
@Override
public String readTerm(BytesRef byteRef) {
    return byteRef.utf8ToString();
}
From source file:org.apache.blur.analysis.type.spatial.BaseSpatialFieldTypeDefinitionTest.java
License:Apache License
protected void runGisDocValueTest(String s) throws IOException {
    DirectoryReader reader = DirectoryReader.open(_dir);
    AtomicReader atomicReader = reader.leaves().get(0).reader();
    SortedDocValues sortedDocValues = atomicReader.getSortedDocValues("fam.geo");
    BytesRef result = new BytesRef();
    sortedDocValues.get(0, result);
    assertEquals(s, result.utf8ToString());
    System.out.println(result.utf8ToString());
    reader.close();
}
From source file:org.apache.blur.command.TermsCommand.java
License:Apache License
private static List<String> terms(IndexReader reader, String fieldName, String startWith, short size)
        throws IOException {
    Term term = getTerm(fieldName, startWith);
    List<String> terms = new ArrayList<String>(size);
    AtomicReader areader = BlurUtil.getAtomicReader(reader);
    Terms termsAll = areader.terms(term.field());
    if (termsAll == null) {
        return terms;
    }
    TermsEnum termEnum = termsAll.iterator(null);
    SeekStatus status = termEnum.seekCeil(term.bytes());
    if (status == SeekStatus.END) {
        return terms;
    }
    BytesRef currentTermText = termEnum.term();
    do {
        terms.add(currentTermText.utf8ToString());
        if (terms.size() >= size) {
            return terms;
        }
    } while ((currentTermText = termEnum.next()) != null);
    return terms;
}
From source file:org.apache.blur.lucene.serializer.SerializerUtil.java
License:Apache License
public static String readString(DataInput in) throws IOException {
    BytesRef bytes = readBytesRef(in);
    return bytes.utf8ToString();
}
From source file:org.apache.blur.manager.IndexManager.java
License:Apache License
public static List<String> terms(IndexReader reader, FieldTypeDefinition typeDef, String columnFamily,
        String columnName, String startWith, short size) throws IOException {
    if (startWith == null) {
        startWith = "";
    }
    Term term = getTerm(columnFamily, columnName, startWith);
    List<String> terms = new ArrayList<String>(size);
    AtomicReader areader = BlurUtil.getAtomicReader(reader);
    Terms termsAll = areader.terms(term.field());
    if (termsAll == null) {
        return terms;
    }
    TermsEnum termEnum = termsAll.iterator(null);
    SeekStatus status = termEnum.seekCeil(term.bytes());
    if (status == SeekStatus.END) {
        return terms;
    }
    BytesRef currentTermText = termEnum.term();
    do {
        terms.add(currentTermText.utf8ToString());
        String readTerm = typeDef.readTerm(currentTermText);
        if (readTerm != null) {
            terms.add(readTerm);
        }
        if (terms.size() >= size) {
            return terms;
        }
    } while ((currentTermText = termEnum.next()) != null);
    return terms;
}
From source file:org.apache.ctakes.utils.wiki.WikiIndex.java
License:Apache License
/**
 * Return a hash table that maps terms to their tfidf values.
 * The input is a list of TermFreqVector objects. The return
 * value is formed by summing up individual tfidf vectors.
 */
private HashMap<String, Double> makeTfIdfVector(ArrayList<Terms> termFreqVectors) throws IOException {
    // map terms to their tfidf values
    CounterMap<String> countVector = new CounterMap<String>();
    HashMap<String, Double> tfIdfVector = new HashMap<String, Double>();
    for (Terms terms : termFreqVectors) {
        if (terms == null) {
            continue; // some documents are empty
        }
        // String[] terms = termFreqVector.getTerms();
        // int[] freqs = termFreqVector.getTermFrequencies();
        TermsEnum termsEnum = terms.iterator(null);
        while (termsEnum.next() != null) {
            BytesRef term = termsEnum.term();
            String termStr = term.utf8ToString();
            countVector.add(termStr);
        }
        for (String key : countVector.keySet()) {
            double tf = similarity.tf((long) countVector.get(key));
            double idf = similarity.idf(indexReader.docFreq(new Term("text", key)), numDocs);
            tfIdfVector.put(key, tf * idf);
        }
        /*
        for (int i = 0; i < terms.length; i++) {
            double tf = similarity.tf(freqs[i]); // defaultSimilarity.tf(freqs[i]);
            double idf = similarity.idf(indexReader.docFreq(new Term("text", terms[i])), numDocs);
            if (tfIdfVector.containsKey(terms[i])) {
                tfIdfVector.put(terms[i], tfIdfVector.get(terms[i]) + tf * idf);
            } else {
                tfIdfVector.put(terms[i], tf * idf);
            }
        }
        */
    }
    return tfIdfVector;
}
From source file:org.apache.mahout.utils.vectors.lucene.CachedTermInfo.java
License:Apache License
public CachedTermInfo(IndexReader reader, String field, int minDf, int maxDfPercent) throws IOException {
    this.field = field;
    Terms t = MultiFields.getTerms(reader, field);
    TermsEnum te = t.iterator(null);
    int numDocs = reader.numDocs();
    double percent = numDocs * maxDfPercent / 100.0;
    // Should we use a linked hash map so that we know terms are in order?
    termEntries = Maps.newLinkedHashMap();
    int count = 0;
    BytesRef text;
    while ((text = te.next()) != null) {
        int df = te.docFreq();
        if (df >= minDf && df <= percent) {
            TermEntry entry = new TermEntry(text.utf8ToString(), count++, df);
            termEntries.put(entry.getTerm(), entry);
        }
    }
}
From source file:org.apache.mahout.utils.vectors.lucene.ClusterLabels.java
License:Apache License
/**
 * Get the list of labels, sorted by best score.
 */
protected List<TermInfoClusterInOut> getClusterLabels(Integer integer,
        Collection<WeightedPropertyVectorWritable> wpvws) throws IOException {
    if (wpvws.size() < minNumIds) {
        log.info("Skipping small cluster {} with size: {}", integer, wpvws.size());
        return null;
    }
    log.info("Processing Cluster {} with {} documents", integer, wpvws.size());
    Directory dir = FSDirectory.open(new File(this.indexDir));
    IndexReader reader = DirectoryReader.open(dir);
    log.info("# of documents in the index {}", reader.numDocs());
    Collection<String> idSet = Sets.newHashSet();
    for (WeightedPropertyVectorWritable wpvw : wpvws) {
        Vector vector = wpvw.getVector();
        if (vector instanceof NamedVector) {
            idSet.add(((NamedVector) vector).getName());
        }
    }
    int numDocs = reader.numDocs();
    OpenBitSet clusterDocBitset = getClusterDocBitset(reader, idSet, this.idField);
    log.info("Populating term infos from the index");
    /**
     * This code is as that of CachedTermInfo, with one major change, which is to get the document frequency.
     *
     * Since we have deleted the documents out of the cluster, the document frequency for a term should only
     * include the in-cluster documents. The document frequency obtained from TermEnum reflects the frequency
     * in the entire index. To get the in-cluster frequency, we need to query the index to get the term
     * frequencies in each document. The number of results of this call will be the in-cluster document
     * frequency.
     */
    Terms t = MultiFields.getTerms(reader, contentField);
    TermsEnum te = t.iterator(null);
    Map<String, TermEntry> termEntryMap = new LinkedHashMap<String, TermEntry>();
    Bits liveDocs = MultiFields.getLiveDocs(reader); // WARNING: returns null if there are no deletions
    int count = 0;
    BytesRef term;
    while ((term = te.next()) != null) {
        OpenBitSet termBitset = new OpenBitSet(reader.maxDoc());
        DocsEnum docsEnum = MultiFields.getTermDocsEnum(reader, null, contentField, term);
        int docID;
        while ((docID = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            // check to see if we don't have any deletions (null) or if the document is live
            if (liveDocs != null && !liveDocs.get(docID)) {
                // document is deleted...
                termBitset.set(docsEnum.docID());
            }
        }
        // AND the term's bitset with cluster doc bitset to get the term's in-cluster frequency.
        // This modifies the termBitset, but that's fine as we are not using it anywhere else.
        termBitset.and(clusterDocBitset);
        int inclusterDF = (int) termBitset.cardinality();
        TermEntry entry = new TermEntry(term.utf8ToString(), count++, inclusterDF);
        termEntryMap.put(entry.getTerm(), entry);
    }
    List<TermInfoClusterInOut> clusteredTermInfo = Lists.newLinkedList();
    int clusterSize = wpvws.size();
    for (TermEntry termEntry : termEntryMap.values()) {
        int corpusDF = reader.docFreq(new Term(this.contentField, termEntry.getTerm()));
        int outDF = corpusDF - termEntry.getDocFreq();
        int inDF = termEntry.getDocFreq();
        double logLikelihoodRatio = scoreDocumentFrequencies(inDF, outDF, clusterSize, numDocs);
        TermInfoClusterInOut termInfoCluster =
                new TermInfoClusterInOut(termEntry.getTerm(), inDF, outDF, logLikelihoodRatio);
        clusteredTermInfo.add(termInfoCluster);
    }
    Collections.sort(clusteredTermInfo);
    // Cleanup
    Closeables.close(reader, true);
    termEntryMap.clear();
    return clusteredTermInfo.subList(0, Math.min(clusteredTermInfo.size(), maxLabels));
}
From source file:org.apache.mahout.utils.vectors.lucene.TFDFMapper.java
License:Apache License
public void map(BytesRef term, int frequency) {
    TermEntry entry = termInfo.getTermEntry(field, term.utf8ToString());
    if (entry != null) {
        vector.setQuick(entry.getTermIdx(),
                weight.calculate(frequency, entry.getDocFreq(), (int) numTerms, numDocs));
    }
}
From source file:org.apache.oodt.cas.filemgr.browser.model.QueryBuilder.java
License:Apache License
public void GenerateCASQuery(org.apache.oodt.cas.filemgr.structs.Query casQ,
        org.apache.lucene.search.Query luceneQ) {
    if (luceneQ instanceof TermQuery) {
        Term t = ((TermQuery) luceneQ).getTerm();
        if (!t.field().equals("__FREE__")) {
            String element = database.getElementID(t.field());
            if (!element.equals("") && !t.text().equals("")) {
                casQ.addCriterion(new TermQueryCriteria(element, t.text()));
            }
        }
    } else if (luceneQ instanceof PhraseQuery) {
        Term[] t = ((PhraseQuery) luceneQ).getTerms();
        if (!t[0].field().equals("__FREE__")) {
            for (Term aT : t) {
                String element = database.getElementID(aT.field());
                if (!element.equals("") && !aT.text().equals("")) {
                    casQ.addCriterion(new TermQueryCriteria(element, aT.text()));
                }
            }
        }
    } else if (luceneQ instanceof TermRangeQuery) {
        BytesRef startT = ((TermRangeQuery) luceneQ).getLowerTerm();
        BytesRef endT = ((TermRangeQuery) luceneQ).getUpperTerm();
        String element = database.getElementID(((TermRangeQuery) luceneQ).getField());
        if (!element.equals("") && !startT.utf8ToString().equals("") && !endT.utf8ToString().equals("")) {
            casQ.addCriterion(new RangeQueryCriteria(element, startT.utf8ToString(), endT.utf8ToString()));
        }
    } else if (luceneQ instanceof BooleanQuery) {
        List<BooleanClause> clauses = ((BooleanQuery) luceneQ).clauses();
        for (BooleanClause clause : clauses) {
            GenerateCASQuery(casQ, clause.getQuery());
        }
    } else {
        System.out.println("Error Parsing Query");
        System.exit(-1);
    }
}