Example usage for org.apache.lucene.util BytesRef utf8ToString

Introduction

This page lists example usages of org.apache.lucene.util.BytesRef.utf8ToString().

Prototype

public String utf8ToString() 

Document

Interprets the stored bytes as UTF-8, returning the resulting string.
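
A minimal sketch (not taken from the sources below) that round-trips a string through a BytesRef. The BytesRef(CharSequence) constructor encodes the text as UTF-8, and utf8ToString() decodes those bytes back into a java.lang.String:

import org.apache.lucene.util.BytesRef;

public class BytesRefDemo {
    public static void main(String[] args) {
        BytesRef ref = new BytesRef("caf\u00e9"); // stored internally as UTF-8 bytes
        System.out.println(ref.length);           // 5: the accented character takes two bytes
        System.out.println(ref.utf8ToString());   // prints "café"
    }
}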

Usage

From source file:retrievability.SampledQuery.java
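
Collects every term of a tweet index's analyzed-content field into a list, converting each BytesRef to its string form: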

ArrayList<String> collectTerms(String indexName) throws Exception {
    ArrayList<String> termArray = new ArrayList<>();

    File indexDir = new File(prop.getProperty(indexName));
    try (IndexReader indexReader = DirectoryReader.open(FSDirectory.open(indexDir))) {
        Terms terms = MultiFields.getTerms(indexReader, TweetIndexer.FIELD_ANALYZED_CONTENT);

        // Walk the terms enumeration; next() returns null when exhausted.
        TermsEnum termsEnum = terms.iterator(TermsEnum.EMPTY);
        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            termArray.add(text.utf8ToString());
        }
    }
    return termArray;
}

From source file:retriever.TermStats.java
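
Builds weighted term statistics from a document's term vector; utf8ToString() recovers each term's text so that numerical tokens can be filtered out: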

public List<TermStats> build() throws Exception {
    String termText;
    BytesRef term;
    Terms tfvector;
    TermsEnum termsEnum;
    int docLen = 0;
    int tf;

    tfvector = reader.getTermVector(docId, TextDocIndexer.FIELD_ANALYZED_CONTENT);
    if (tfvector == null || tfvector.size() == 0)
        return null;

    // Construct the normalized tf vector
    termsEnum = tfvector.iterator(); // access the terms for this field

    while ((term = termsEnum.next()) != null) { // explore the terms for this field            
        tf = (int) termsEnum.totalTermFreq();
        termText = term.utf8ToString();
        if (isNumerical(termText))
            continue;

        termStats.add(new TermStats(termText, tf, reader));
        docLen += tf;
    }

    for (TermStats ts : termStats) {
        ts.computeWeight(docLen, qSelLambda);
    }

    Collections.sort(termStats);
    int numTopTerms = (int) (queryToDocRatio * termStats.size());
    numTopTerms = Math.min(numTopTerms, MAX_QUERY_TERMS);
    return termStats.subList(0, numTopTerms);
}

From source file:retriever.TermWt.java
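
Constructs a sorted term-frequency vector for one document, decoding each term with utf8ToString() before pairing it with its frequency: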

DocVector(IndexReader reader, int docId) throws Exception {
    this.reader = reader;
    Terms terms = reader.getTermVector(docId, FIELD_ANALYZED_CONTENT);

    TermsEnum termsEnum;
    BytesRef term;
    List<TermWt> tfvec = new ArrayList<>();

    // Construct the normalized tf vector
    termsEnum = terms.iterator(null); // access the terms for this field
    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        String termStr = term.utf8ToString();
        DocsEnum docsEnum = termsEnum.docs(null, null); // enumerate through documents, in this case only one
        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            //get the term frequency in the document
            int tf = docsEnum.freq();
            tfvec.add(new TermWt(termStr, tf));
        }
    }

    Collections.sort(tfvec);

    vec = new TermWt[tfvec.size()];
    vec = tfvec.toArray(vec);
}

From source file:retriever.CentroidLinkageSim.java
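
Groups a document's terms into clusters of word vectors and returns per-cluster centroids; utf8ToString() provides the term text used for the cluster-id and word-vector lookups: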

HashMap<String, WordVec> loadWordClusterInfo(int docId, byte allDocWords) throws Exception {
    String termText;
    BytesRef term;
    Terms tfvector;
    TermsEnum termsEnum;
    HashMap<Integer, List<WordVec>> clusterMap = new HashMap<>();

    tfvector = reader.getTermVector(docId, TextDocIndexer.FIELD_ANALYZED_CONTENT);

    // Construct the normalized tf vector
    termsEnum = tfvector.iterator(); // access the terms for this field

    int wordId = 0;
    //System.out.println("Getting cluster ids for document " + docId);
    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        wordId++;
        termText = term.utf8ToString();
        int clusterId = allDocWords == ALL_WORDS_AS_SEPARATE_CLUSTERS ? wordId : // each word a new cluster id
                allDocWords == WORDS_AS_CLUSTERS ? WordVecs.getClusterId(termText) : // cluster ids from vocab
                        0; // each word the same cluster id
        if (clusterId < 0)
            continue;

        // Get the term and its cluster id; store them in a hashmap for
        // computing group-wise centroids
        WordVec wv = WordVecs.getVecCached(termText);
        if (wv == null)
            continue;
        List<WordVec> veclist = clusterMap.get(clusterId);
        if (veclist == null) {
            veclist = new ArrayList<>();
            clusterMap.put(clusterId, veclist);
        }
        veclist.add(wv);
    }
    //System.out.println("Got cluster ids for doc " + docId);

    // Return a list of centroids computed by grouping together the cluster ids
    HashMap<String, WordVec> centroids = new HashMap<>();

    //System.out.println("#clusters in doc " + docId + ": " + clusterMap.size());
    for (Map.Entry<Integer, List<WordVec>> e : clusterMap.entrySet()) {
        List<WordVec> veclist = e.getValue();
        WordVec centroid = WordVecs.getCentroid(veclist);
        centroids.put("Cluster: " + e.getKey(), centroid);
    }

    return centroids;
}

From source file:retriever.TermFreq.java
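
Renders a document's tf-idf vector as a string; utf8ToString() decodes each term, and numeric tokens are skipped: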

public String getTfVectorString(int docId) throws Exception {
    Terms terms = reader.getTermVector(docId, FIELD_ANALYZED_CONTENT);
    if (terms == null || terms.size() == 0)
        return "";

    TermsEnum termsEnum;
    BytesRef term;
    List<TermFreq> tfvec = new ArrayList<>();

    // Construct the normalized tf vector
    termsEnum = terms.iterator(null); // access the terms for this field
    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        String termStr = term.utf8ToString();
        if (isNumber(termStr))
            continue;
        DocsEnum docsEnum = termsEnum.docs(null, null); // enumerate through documents, in this case only one
        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            //get the term frequency in the document
            int tf = docsEnum.freq();
            float idf = numDocsInCollection / (float) reader.docFreq(new Term(FIELD_ANALYZED_CONTENT, term));
            tfvec.add(new TermFreq(termStr, tf, idf));
        }
    }

    Collections.sort(tfvec);
    StringBuffer buff = new StringBuffer();
    for (TermFreq tf : tfvec)
        buff.append(tf.term).append(":").append(tf.tf).append(", ").append(tf.idf).append(" ");

    if (buff.length() > 2) {
        buff.deleteCharAt(buff.length() - 1);
        buff.deleteCharAt(buff.length() - 1);
    }

    return buff.toString();
}

From source file:stemmer.Dictionary.java

License:Apache License
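
From Lucene's Hunspell dictionary loader: after the entries are cleaned and sorted on disk, each line is read back as a BytesRef and decoded with utf8ToString() before its flags are parsed: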

/**
 * Reads the dictionary file through the provided InputStreams, building up the words map
 *
 * @param dictionaries InputStreams to read the dictionary file through
 * @param decoder CharsetDecoder used to decode the contents of the file
 * @throws IOException Can be thrown while reading from the file
 */
private void readDictionaryFiles(List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words)
        throws IOException {
    BytesRef flagsScratch = new BytesRef();
    IntsRef scratchInts = new IntsRef();

    StringBuilder sb = new StringBuilder();

    File unsorted = File.createTempFile("unsorted", "dat", tempDir);
    ByteSequencesWriter writer = new ByteSequencesWriter(unsorted);
    boolean success = false;
    try {
        for (InputStream dictionary : dictionaries) {
            BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
            String line = lines.readLine(); // first line is number of entries (approximately, sometimes)

            while ((line = lines.readLine()) != null) {
                line = unescapeEntry(line);
                if (needsInputCleaning) {
                    int flagSep = line.lastIndexOf(FLAG_SEPARATOR);
                    if (flagSep == -1) {
                        CharSequence cleansed = cleanInput(line, sb);
                        writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
                    } else {
                        String text = line.substring(0, flagSep);
                        CharSequence cleansed = cleanInput(text, sb);
                        if (cleansed != sb) {
                            sb.setLength(0);
                            sb.append(cleansed);
                        }
                        sb.append(line.substring(flagSep));
                        writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
                    }
                } else {
                    writer.write(line.getBytes(StandardCharsets.UTF_8));
                }
            }
        }
        success = true;
    } finally {
        if (success) {
            IOUtils.close(writer);
        } else {
            IOUtils.closeWhileHandlingException(writer);
        }
    }
    File sorted = File.createTempFile("sorted", "dat", tempDir);

    OfflineSorter sorter = new OfflineSorter(new Comparator<BytesRef>() {
        BytesRef scratch1 = new BytesRef();
        BytesRef scratch2 = new BytesRef();

        @Override
        public int compare(BytesRef o1, BytesRef o2) {
            scratch1.bytes = o1.bytes;
            scratch1.offset = o1.offset;
            scratch1.length = o1.length;

            for (int i = scratch1.length - 1; i >= 0; i--) {
                if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR) {
                    scratch1.length = i;
                    break;
                }
            }

            scratch2.bytes = o2.bytes;
            scratch2.offset = o2.offset;
            scratch2.length = o2.length;

            for (int i = scratch2.length - 1; i >= 0; i--) {
                if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR) {
                    scratch2.length = i;
                    break;
                }
            }

            int cmp = scratch1.compareTo(scratch2);
            if (cmp == 0) {
                // tie break on whole row
                return o1.compareTo(o2);
            } else {
                return cmp;
            }
        }
    });
    sorter.sort(unsorted, sorted);
    unsorted.delete();

    ByteSequencesReader reader = new ByteSequencesReader(sorted);
    BytesRef scratchLine = new BytesRef();

    // TODO: the flags themselves can be double-chars (long) or also numeric
    // either way the trick is to encode them as char... but they must be parsed differently

    String currentEntry = null;
    IntsRef currentOrds = new IntsRef();

    String line;
    while (reader.read(scratchLine)) {
        line = scratchLine.utf8ToString();
        String entry;
        char[] wordForm;

        int flagSep = line.lastIndexOf(FLAG_SEPARATOR);
        if (flagSep == -1) {
            wordForm = NOFLAGS;
            entry = line;
        } else {
            // note, there can be comments (morph description) after a flag.
            // we should really look for any whitespace: currently just tab and space
            int end = line.indexOf('\t', flagSep);
            if (end == -1)
                end = line.length();
            int end2 = line.indexOf(' ', flagSep);
            if (end2 == -1)
                end2 = line.length();
            end = Math.min(end, end2);

            String flagPart = line.substring(flagSep + 1, end);
            if (aliasCount > 0) {
                flagPart = getAliasValue(Integer.parseInt(flagPart));
            }

            wordForm = flagParsingStrategy.parseFlags(flagPart);
            Arrays.sort(wordForm);
            entry = line.substring(0, flagSep);
        }

        int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
        if (cmp < 0) {
            throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
        } else {
            encodeFlags(flagsScratch, wordForm);
            int ord = flagLookup.add(flagsScratch);
            if (ord < 0) {
                // already exists in our hash
                ord = (-ord) - 1;
            }
            // finalize current entry, and switch "current" if necessary
            if (cmp > 0 && currentEntry != null) {
                Util.toUTF32(currentEntry, scratchInts);
                words.add(scratchInts, currentOrds);
            }
            // swap current
            if (cmp > 0 || currentEntry == null) {
                currentEntry = entry;
                currentOrds = new IntsRef(); // must be this way
            }
            currentOrds.grow(currentOrds.length + 1);
            currentOrds.ints[currentOrds.length++] = ord;
        }
    }

    // finalize last entry
    Util.toUTF32(currentEntry, scratchInts);
    words.add(scratchInts, currentOrds);

    reader.close();
    sorted.delete();
}

From source file:suonos.lucene.fields.IndexedFieldCountsBuilder.java

License:Apache License
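
Accumulates per-term document-frequency counts; utf8ToString() yields the term text, which is then normalized to an id for case-insensitive filtering: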

void add(IndexedField fld, Map<String, IndexedFieldTermCount> valuesMap, String filter, BytesRef term,
        int docFreq) {

    String termVal = term.utf8ToString();

    // Case insensitive comparison.
    String termValLC = TagUtils.convertStringToId(termVal);
    if (filter != null && !termValLC.startsWith(filter)) {
        return;
    }

    IndexedFieldTermCount c = valuesMap.get(termValLC);

    if (c == null) {
        valuesMap.put(termValLC, c = new IndexedFieldTermCount(fld, termVal, termValLC));
    }

    c.docFreq += docFreq;
    System.out.println("term=" + termVal + " " + docFreq);
}

From source file:trustframework.evidence.github.ConversationMimicry.java
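
Builds a term-to-frequency map from a document's term vector, keying the map by each term's decoded string: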

private Map<String, Integer> getFrequencyMap(IndexReader ir, Integer docIndex) throws IOException {
    Terms frequencyVector = ir.getTermVector(docIndex, "Content");
    TermsEnum termsIterator = frequencyVector.iterator();
    Map<String, Integer> frequencyMap = new HashMap<>();
    BytesRef text;
    while ((text = termsIterator.next()) != null) {
        String term = text.utf8ToString();
        int freq = (int) termsIterator.totalTermFreq();
        frequencyMap.put(term, freq);
    }
    return frequencyMap;
}

From source file:uk.ac.ebi.biostudies.efo.Autocompletion.java

License:Apache License
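
Collects the terms of a field whose document frequency is at least minFreq, decoding each term with utf8ToString():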

public List<String> getTerms(String fieldName, int minFreq) throws IOException {
    List<String> termsList = new ArrayList<>();

    try {
        IndexReader reader = indexManager.getIndexReader();
        Terms terms = MultiTerms.getTerms(reader, fieldName);
        if (null != terms) {
            TermsEnum iterator = terms.iterator();
            BytesRef byteRef;
            while ((byteRef = iterator.next()) != null) {
                if (iterator.docFreq() >= minFreq) {
                    termsList.add(byteRef.utf8ToString());
                }
            }
        }
    } catch (Exception ex) {
        logger.error("getTerms problem", ex);
    }
    return termsList;
}

From source file:uk.co.flax.luwak.analysis.TermsEnumTokenStream.java

License:Apache License
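
A TokenStream that emits one token per term of a TermsEnum; incrementToken() decodes the next term with utf8ToString() into the char-term attribute: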

@Override
public final boolean incrementToken() throws IOException {
    clearAttributes();
    BytesRef bytes = termsEnum.next();
    if (bytes == null)
        return false;
    charTerm.setEmpty();
    charTerm.append(bytes.utf8ToString());
    return true;
}