Example usage for org.apache.lucene.util BytesRef utf8ToString

Introduction

On this page you can find example usage of org.apache.lucene.util BytesRef utf8ToString.

Prototype

public String utf8ToString() 

Document

Interprets stored bytes as UTF8 bytes, returning the resulting string
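
Before the full examples below, here is a minimal, self-contained sketch of the typical round trip. The class name Utf8ToStringDemo is illustrative only; BytesRef(CharSequence) stores the UTF-8 encoding of the given text, and utf8ToString() decodes it back.

import org.apache.lucene.util.BytesRef;

public class Utf8ToStringDemo {
    public static void main(String[] args) {
        // BytesRef(CharSequence) stores the UTF-8 encoding of the given text.
        BytesRef bytes = new BytesRef("caf\u00e9");
        // utf8ToString() interprets the stored bytes as UTF-8 and returns a String.
        String decoded = bytes.utf8ToString();
        System.out.println(decoded); // prints: café
    }
}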

Usage

From source file:game.TermFreq.java

void loadTfVec() throws Exception {

    IndexReader reader = retriever.getReader();
    long sumDf = reader.getSumDocFreq(TrecDocRetriever.FIELD_ANALYZED_CONTENT);

    Terms terms = reader.getTermVector(luceneDocIdToGuess, FIELD_ANALYZED_CONTENT);
    if (terms == null || terms.size() == 0)
        return;

    TermsEnum termsEnum;
    BytesRef term;
    tfvec = new ArrayList<>();

    // Construct the normalized tf vector
    termsEnum = terms.iterator(null); // access the terms for this field
    int doclen = 0;
    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        String termStr = term.utf8ToString();
        String stem = retriever.analyze(termStr);
        DocsEnum docsEnum = termsEnum.docs(null, null); // enumerate through documents, in this case only one
        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            //get the term frequency in the document
            int tf = docsEnum.freq();
            TermFreq tfq = new TermFreq(new Term(TrecDocRetriever.FIELD_ANALYZED_CONTENT, term), termStr, tf);
            tfvec.add(tfq);

            doclen += tf;
        }
    }

    for (TermFreq tf : tfvec) {
        tf.tf = tf.tf / (float) doclen; // normalize by len
        float idf = sumDf / (float) reader.docFreq(tf.term); // cast to avoid integer division
        tf.wt = (float) (Math.log(1 + LAMBDA / (ONE_MINUS_LAMBDA) * tf.tf * idf));
    }

    Collections.sort(tfvec);
}

From source file:indexer.OptimizedRealValuedVecIndexer.java

void processAllDocumentWise() throws Exception {
    Cell cell, requantizedCell;
    BytesRef term;
    Terms tfvector;
    TermsEnum termsEnum;

    Set<Cell> splitCells = new HashSet<>();

    int numDocs = reader.numDocs();

    for (int i = 0; i < numDocs; i++) {
        tfvector = reader.getTermVector(i, DocVector.FIELD_CELL_ID);
        termsEnum = tfvector.iterator(); // access the terms for this field

        StringBuffer requantizedCells = new StringBuffer();
        DocVector p = new DocVector(reader.document(i), numDimensions, numIntervals, null);

        // iterate for each cell in this document
        while ((term = termsEnum.next()) != null) { // explore the terms for this field
            String cellId = term.utf8ToString(); // one cell docName
            cell = new Cell(cellId);

            if (cell.toSplit(reader)) { // do we need to requantize?
                splitCells.add(cell); // mark this cell
                requantizedCell = cell.quantize(p); // this function returns a new object
                System.out.println("Cell " + cell + " updated to " + requantizedCell);

                requantizedCells.append(requantizedCell).append(" ");
            } else {
                requantizedCells.append(cell).append(" ");
            }
        }
        p.setQuantized(requantizedCells.toString());
        writer.addDocument(p.constructDoc());
    }

    saveSplitCells(writer, new ArrayList<Cell>(splitCells));

    reader.close();
    writer.close();
}

From source file:indextranslator.BOWTranslator.java

public void translate(String docIdStr, int docId) throws Exception {
    String termText;
    BytesRef term;
    Terms tfvector;
    TermsEnum termsEnum;
    int tf;

    tfvector = reader.getTermVector(docId, TextDocIndexer.FIELD_ANALYZED_CONTENT);
    if (tfvector == null || tfvector.size() == 0)
        return;

    // Construct the normalized tf vector
    termsEnum = tfvector.iterator(); // access the terms for this field
    StringBuffer buff = new StringBuffer();

    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        tf = (int) termsEnum.totalTermFreq();
        termText = term.utf8ToString();
        buff.append(dict.getTranslations(termText, tf)).append("\n");
    }

    Document doc = constructDoc(docIdStr, buff.toString());
    writer.addDocument(doc);
}

From source file:io.anserini.integration.IndexerTest.java

License:Apache License

private void dumpPostings(IndexReader reader) throws IOException {
    // This is how you iterate through terms in the postings list.
    LeafReader leafReader = reader.leaves().get(0).reader();
    TermsEnum termsEnum = leafReader.terms("text").iterator();
    BytesRef bytesRef = termsEnum.next();
    while (bytesRef != null) {
        // This is the current term in the dictionary.
        String token = bytesRef.utf8ToString();
        Term term = new Term("text", token);
        System.out.print(token + " (df = " + reader.docFreq(term) + "):");

        PostingsEnum postingsEnum = leafReader.postings(term);
        while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            System.out.print(String.format(" (%s, %s)", postingsEnum.docID(), postingsEnum.freq()));
        }
        System.out.println("");

        bytesRef = termsEnum.next();
    }
}

From source file:io.anserini.rerank.lib.Rm3Reranker.java

License:Apache License

private FeatureVector createdFeatureVector(Terms terms, IndexReader reader, boolean tweetsearch) {
    FeatureVector f = new FeatureVector();

    try {
        int numDocs = reader.numDocs();
        TermsEnum termsEnum = terms.iterator();

        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            String term = text.utf8ToString();

            if (term.length() < 2 || term.length() > 20)
                continue;
            if (!term.matches("[a-z0-9]+"))
                continue;

            // This seemingly arbitrary logic needs some explanation. See following PR for details:
            //   https://github.com/castorini/Anserini/pull/289
            //
            // We have long known that stopwords have a big impact in RM3. If we include stopwords
            // in feedback, effectiveness is affected negatively. In the previous implementation, we
            // built custom stopwords lists by selecting top k terms from the collection. We only
            // had two stopwords lists, for gov2 and for Twitter. The gov2 list is used on all
            // collections other than Twitter.
            //
            // The logic below instead uses a df threshold: If a term appears in more than n percent
            // of the documents, then it is discarded as a feedback term. This heuristic has the
            // advantage of getting rid of collection-specific stopwords lists, but at the cost of
            // introducing an additional tuning parameter.
            //
            // Cognizant of the dangers of (essentially) tuning on test data, here's what I
            // (@lintool) did:
            //
            // + For newswire collections, I picked a number, 10%, that seemed right. This value
            //   actually increased effectiveness in most conditions across all newswire collections.
            //
            // + This 10% value worked fine on web collections; effectiveness didn't change much.
            //
            // Since this was the first and only heuristic value I selected, we're not really tuning
            // parameters.
            //
            // The 10% threshold, however, doesn't work well on tweets because tweets are much
            // shorter. Based on a list of terms in the collection by df: For the Tweets2011 collection,
            // I found a threshold close to a nice round number that approximated the length of the
            // current stopwords list, by eyeballing the df values. This turned out to be 1%. I did
            // this again for the Tweets2013 collection, using the same approach, and obtained a value
            // of 0.7%.
            //
            // With both values, we obtained effectiveness pretty close to the old values with the
            // custom stopwords list.
            int df = reader.docFreq(new Term(FIELD_BODY, term));
            float ratio = (float) df / numDocs;
            if (tweetsearch) {
                if (numDocs > 100000000) { // Probably Tweets2013
                    if (ratio > 0.007f)
                        continue;
                } else {
                    if (ratio > 0.01f)
                        continue;
                }
            } else if (ratio > 0.1f)
                continue;

            int freq = (int) termsEnum.totalTermFreq();
            f.addFeatureWeight(term, (float) freq);
        }
    } catch (Exception e) {
        e.printStackTrace();
        // Return empty feature vector
        return f;
    }

    return f;
}

From source file:io.anserini.util.ExtractTopDfTerms.java

License:Apache License

public static void main(String[] args) throws Exception {
    Args myArgs = new Args();
    CmdLineParser parser = new CmdLineParser(myArgs, ParserProperties.defaults().withUsageWidth(90));

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: ExtractTopDfTerms" + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }

    Directory dir = FSDirectory.open(Paths.get(myArgs.index));
    IndexReader reader = DirectoryReader.open(dir);
    int numDocs = reader.numDocs();

    Comparator<Pair> comp = new Comparator<Pair>() {
        @Override
        public int compare(Pair p1, Pair p2) {
            if (p1.value == p2.value) {
                return p1.key.compareTo(p2.key);
            } else
                return (p1.value < p2.value) ? -1 : 1;
        }
    };

    PriorityQueue<Pair> queue = new PriorityQueue<Pair>(myArgs.topK, comp);

    LOG.info("Starting to iterate through all terms...");
    Terms terms = MultiFields.getFields(reader).terms(myArgs.field);
    TermsEnum termsEnum = terms.iterator();
    BytesRef text;
    int cnt = 0;
    while ((text = termsEnum.next()) != null) {
        String term = text.utf8ToString();
        if (term.length() == 0)
            continue;

        Pair p = new Pair(term, reader.docFreq(new Term(myArgs.field, term)));
        if (queue.size() < myArgs.topK) {
            queue.add(p);
        } else {
            if (comp.compare(p, queue.peek()) > 0) {
                queue.poll();
                queue.add(p);
            }
        }

        cnt++;
        if (cnt % 1000000 == 0) {
            LOG.info("At term " + term);
        }
    }

    PrintStream out = new PrintStream(new FileOutputStream(new File(myArgs.output)));
    Pair pair;
    while ((pair = queue.poll()) != null) {
        out.println(pair.key + "\t" + pair.value + "\t" + numDocs + "\t" + ((float) pair.value / numDocs));
    }
    out.close();

    LOG.info("Done!");
}

From source file:io.crate.analyze.NumberOfReplicasTest.java

License:Apache License

@Test
public void testFromEmptySettings() throws Exception {
    BytesRef numberOfReplicas = NumberOfReplicas.fromSettings(Settings.EMPTY);
    assertThat(numberOfReplicas.utf8ToString(), is("1"));
}

From source file:io.crate.analyze.NumberOfReplicasTest.java

License:Apache License

@Test
public void testNumber() throws Exception {
    BytesRef numberOfReplicas = NumberOfReplicas
            .fromSettings(Settings.builder().put(NumberOfReplicas.NUMBER_OF_REPLICAS, 4).build());
    assertThat(numberOfReplicas.utf8ToString(), is("4"));
}

From source file:io.crate.analyze.NumberOfReplicasTest.java

License:Apache License

@Test
public void testAutoExpandSettingsTakePrecedence() throws Exception {
    BytesRef numberOfReplicas = NumberOfReplicas
            .fromSettings(Settings.builder().put(NumberOfReplicas.AUTO_EXPAND_REPLICAS, "0-all")
                    .put(NumberOfReplicas.NUMBER_OF_REPLICAS, 1).build());
    assertThat(numberOfReplicas.utf8ToString(), is("0-all"));
}

From source file:io.crate.core.NumberOfReplicasTest.java

License:Apache License

@Test
public void testFromEmptySettings() throws Exception {
    BytesRef numberOfReplicas = NumberOfReplicas.fromSettings(ImmutableSettings.EMPTY);
    assertThat(numberOfReplicas.utf8ToString(), is("1"));
}