List of usage examples for org.apache.lucene.util BytesRef utf8ToString
public String utf8ToString()
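The method decodes the UTF-8 bytes in the valid region of a BytesRef (offset through offset + length) into a Java String. Before the per-project examples, here is a minimal, self-contained sketch of the round trip (the class name and sample string are illustrative, not taken from any of the projects below):

import org.apache.lucene.util.BytesRef;

public class BytesRefUtf8Demo {
    public static void main(String[] args) {
        // BytesRef(CharSequence) encodes the text as UTF-8 bytes
        BytesRef ref = new BytesRef("café");
        // utf8ToString() decodes only the valid window [offset, offset + length)
        String decoded = ref.utf8ToString();
        System.out.println(decoded); // prints: café
    }
}

Note that utf8ToString() allocates a new String on each call, which is why the enumeration loops in the examples below call it exactly once per term.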
From source file:retrievability.SampledQuery.java
ArrayList<String> collectTerms(String indexName) throws Exception {
    ArrayList<String> termArray = new ArrayList<>();
    File indexDir = new File(prop.getProperty(indexName));
    try (IndexReader indexReader = DirectoryReader.open(FSDirectory.open(indexDir))) {
        Terms terms = MultiFields.getTerms(indexReader, TweetIndexer.FIELD_ANALYZED_CONTENT);
        for (TermsEnum termsEnum = terms.iterator(TermsEnum.EMPTY);;) {
            BytesRef text = termsEnum.next();
            //System.err.println(text.utf8ToString());
            if (text == null)
                break;
            termArray.add(text.utf8ToString());
        }
    }
    return termArray;
}
From source file:retriever.TermStats.java
public List<TermStats> build() throws Exception {
    String termText;
    BytesRef term;
    Terms tfvector;
    TermsEnum termsEnum;
    int docLen = 0;
    int tf;

    tfvector = reader.getTermVector(docId, TextDocIndexer.FIELD_ANALYZED_CONTENT);
    if (tfvector == null || tfvector.size() == 0)
        return null;

    // Construct the normalized tf vector
    termsEnum = tfvector.iterator(); // access the terms for this field
    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        tf = (int) termsEnum.totalTermFreq();
        termText = term.utf8ToString();
        if (isNumerical(termText))
            continue;
        termStats.add(new TermStats(termText, tf, reader));
        docLen += tf;
    }

    for (TermStats ts : termStats) {
        ts.computeWeight(docLen, qSelLambda);
    }
    Collections.sort(termStats);

    int numTopTerms = (int) (queryToDocRatio * termStats.size());
    numTopTerms = Math.min(numTopTerms, MAX_QUERY_TERMS);
    return termStats.subList(0, numTopTerms);
}
From source file:retriever.TermWt.java
DocVector(IndexReader reader, int docId) throws Exception {
    this.reader = reader;
    Terms terms = reader.getTermVector(docId, FIELD_ANALYZED_CONTENT);
    TermsEnum termsEnum;
    BytesRef term;
    List<TermWt> tfvec = new ArrayList<>();

    // Construct the normalized tf vector
    termsEnum = terms.iterator(null); // access the terms for this field
    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        String termStr = term.utf8ToString();
        DocsEnum docsEnum = termsEnum.docs(null, null); // enumerate through documents, in this case only one
        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            // get the term frequency in the document
            int tf = docsEnum.freq();
            tfvec.add(new TermWt(termStr, tf));
        }
    }

    Collections.sort(tfvec);

    vec = new TermWt[tfvec.size()];
    vec = tfvec.toArray(vec);
}
From source file:retriever.CentroidLinkageSim.java
HashMap<String, WordVec> loadWordClusterInfo(int docId, byte allDocWords) throws Exception {
    String termText;
    BytesRef term;
    Terms tfvector;
    TermsEnum termsEnum;
    HashMap<Integer, List<WordVec>> clusterMap = new HashMap<>();

    tfvector = reader.getTermVector(docId, TextDocIndexer.FIELD_ANALYZED_CONTENT);

    // Construct the normalized tf vector
    termsEnum = tfvector.iterator(); // access the terms for this field
    int wordId = 0;

    //System.out.println("Getting cluster ids for document " + docId);
    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        wordId++;
        termText = term.utf8ToString();
        int clusterId = allDocWords == ALL_WORDS_AS_SEPARATE_CLUSTERS ? wordId : // each word a new cluster id
                allDocWords == WORDS_AS_CLUSTERS ? WordVecs.getClusterId(termText) : // cluster ids from vocab
                        0; // each word the same cluster id
        if (clusterId < 0)
            continue;

        // Get the term and its cluster id.. Store in a hashmap for
        // computing group-wise centroids
        WordVec wv = WordVecs.getVecCached(termText);
        if (wv == null)
            continue;

        List<WordVec> veclist = clusterMap.get(clusterId);
        if (veclist == null) {
            veclist = new ArrayList<>();
            clusterMap.put(clusterId, veclist);
        }
        veclist.add(wv);
    }
    //System.out.println("Got cluster ids for doc " + docId);

    // Return a list of centroids computed by grouping together the cluster ids
    HashMap<String, WordVec> centroids = new HashMap<>();
    //System.out.println("#clusters in doc " + docId + ": " + clusterMap.size());
    for (Map.Entry<Integer, List<WordVec>> e : clusterMap.entrySet()) {
        List<WordVec> veclist = e.getValue();
        WordVec centroid = WordVecs.getCentroid(veclist);
        centroids.put("Cluster: " + e.getKey(), centroid);
    }
    return centroids;
}
From source file:retriever.TermFreq.java
public String getTfVectorString(int docId) throws Exception {
    Terms terms = reader.getTermVector(docId, FIELD_ANALYZED_CONTENT);
    if (terms == null || terms.size() == 0)
        return "";

    TermsEnum termsEnum;
    BytesRef term;
    List<TermFreq> tfvec = new ArrayList<>();

    // Construct the normalized tf vector
    termsEnum = terms.iterator(null); // access the terms for this field
    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        String termStr = term.utf8ToString();
        if (isNumber(termStr))
            continue;
        DocsEnum docsEnum = termsEnum.docs(null, null); // enumerate through documents, in this case only one
        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            // get the term frequency in the document
            int tf = docsEnum.freq();
            float idf = numDocsInCollection / (float) reader.docFreq(new Term(FIELD_ANALYZED_CONTENT, term));
            tfvec.add(new TermFreq(termStr, tf, idf));
        }
    }

    Collections.sort(tfvec);

    StringBuffer buff = new StringBuffer();
    for (TermFreq tf : tfvec)
        buff.append(tf.term).append(":").append(tf.tf).append(", ").append(tf.idf).append(" ");
    if (buff.length() > 2) {
        buff.deleteCharAt(buff.length() - 1);
        buff.deleteCharAt(buff.length() - 1);
    }
    return buff.toString();
}
From source file:stemmer.Dictionary.java
License:Apache License
/**
 * Reads the dictionary file through the provided InputStreams, building up the words map
 *
 * @param dictionaries InputStreams to read the dictionary file through
 * @param decoder CharsetDecoder used to decode the contents of the file
 * @throws IOException Can be thrown while reading from the file
 */
private void readDictionaryFiles(List<InputStream> dictionaries, CharsetDecoder decoder, Builder<IntsRef> words)
        throws IOException {
    BytesRef flagsScratch = new BytesRef();
    IntsRef scratchInts = new IntsRef();
    StringBuilder sb = new StringBuilder();

    File unsorted = File.createTempFile("unsorted", "dat", tempDir);
    ByteSequencesWriter writer = new ByteSequencesWriter(unsorted);
    boolean success = false;
    try {
        for (InputStream dictionary : dictionaries) {
            BufferedReader lines = new BufferedReader(new InputStreamReader(dictionary, decoder));
            String line = lines.readLine(); // first line is number of entries (approximately, sometimes)
            while ((line = lines.readLine()) != null) {
                line = unescapeEntry(line);
                if (needsInputCleaning) {
                    int flagSep = line.lastIndexOf(FLAG_SEPARATOR);
                    if (flagSep == -1) {
                        CharSequence cleansed = cleanInput(line, sb);
                        writer.write(cleansed.toString().getBytes(StandardCharsets.UTF_8));
                    } else {
                        String text = line.substring(0, flagSep);
                        CharSequence cleansed = cleanInput(text, sb);
                        if (cleansed != sb) {
                            sb.setLength(0);
                            sb.append(cleansed);
                        }
                        sb.append(line.substring(flagSep));
                        writer.write(sb.toString().getBytes(StandardCharsets.UTF_8));
                    }
                } else {
                    writer.write(line.getBytes(StandardCharsets.UTF_8));
                }
            }
        }
        success = true;
    } finally {
        if (success) {
            IOUtils.close(writer);
        } else {
            IOUtils.closeWhileHandlingException(writer);
        }
    }

    File sorted = File.createTempFile("sorted", "dat", tempDir);
    OfflineSorter sorter = new OfflineSorter(new Comparator<BytesRef>() {
        BytesRef scratch1 = new BytesRef();
        BytesRef scratch2 = new BytesRef();

        @Override
        public int compare(BytesRef o1, BytesRef o2) {
            scratch1.bytes = o1.bytes;
            scratch1.offset = o1.offset;
            scratch1.length = o1.length;
            for (int i = scratch1.length - 1; i >= 0; i--) {
                if (scratch1.bytes[scratch1.offset + i] == FLAG_SEPARATOR) {
                    scratch1.length = i;
                    break;
                }
            }

            scratch2.bytes = o2.bytes;
            scratch2.offset = o2.offset;
            scratch2.length = o2.length;
            for (int i = scratch2.length - 1; i >= 0; i--) {
                if (scratch2.bytes[scratch2.offset + i] == FLAG_SEPARATOR) {
                    scratch2.length = i;
                    break;
                }
            }

            int cmp = scratch1.compareTo(scratch2);
            if (cmp == 0) {
                // tie break on whole row
                return o1.compareTo(o2);
            } else {
                return cmp;
            }
        }
    });
    sorter.sort(unsorted, sorted);
    unsorted.delete();

    ByteSequencesReader reader = new ByteSequencesReader(sorted);
    BytesRef scratchLine = new BytesRef();

    // TODO: the flags themselves can be double-chars (long) or also numeric
    // either way the trick is to encode them as char... but they must be parsed differently

    String currentEntry = null;
    IntsRef currentOrds = new IntsRef();
    String line;
    while (reader.read(scratchLine)) {
        line = scratchLine.utf8ToString();
        String entry;
        char wordForm[];

        int flagSep = line.lastIndexOf(FLAG_SEPARATOR);
        if (flagSep == -1) {
            wordForm = NOFLAGS;
            entry = line;
        } else {
            // note, there can be comments (morph description) after a flag.
            // we should really look for any whitespace: currently just tab and space
            int end = line.indexOf('\t', flagSep);
            if (end == -1)
                end = line.length();
            int end2 = line.indexOf(' ', flagSep);
            if (end2 == -1)
                end2 = line.length();
            end = Math.min(end, end2);

            String flagPart = line.substring(flagSep + 1, end);
            if (aliasCount > 0) {
                flagPart = getAliasValue(Integer.parseInt(flagPart));
            }

            wordForm = flagParsingStrategy.parseFlags(flagPart);
            Arrays.sort(wordForm);
            entry = line.substring(0, flagSep);
        }

        int cmp = currentEntry == null ? 1 : entry.compareTo(currentEntry);
        if (cmp < 0) {
            throw new IllegalArgumentException("out of order: " + entry + " < " + currentEntry);
        } else {
            encodeFlags(flagsScratch, wordForm);
            int ord = flagLookup.add(flagsScratch);
            if (ord < 0) {
                // already exists in our hash
                ord = (-ord) - 1;
            }
            // finalize current entry, and switch "current" if necessary
            if (cmp > 0 && currentEntry != null) {
                Util.toUTF32(currentEntry, scratchInts);
                words.add(scratchInts, currentOrds);
            }
            // swap current
            if (cmp > 0 || currentEntry == null) {
                currentEntry = entry;
                currentOrds = new IntsRef(); // must be this way
            }
            currentOrds.grow(currentOrds.length + 1);
            currentOrds.ints[currentOrds.length++] = ord;
        }
    }

    // finalize last entry
    Util.toUTF32(currentEntry, scratchInts);
    words.add(scratchInts, currentOrds);

    reader.close();
    sorted.delete();
}
From source file:suonos.lucene.fields.IndexedFieldCountsBuilder.java
License:Apache License
void add(IndexedField fld, Map<String, IndexedFieldTermCount> valuesMap, String filter, BytesRef term,
        int docFreq) {
    String termVal = term.utf8ToString();

    // Case insensitive comparison.
    String termValLC = TagUtils.convertStringToId(termVal);

    if (filter != null && !termValLC.startsWith(filter)) {
        return;
    }

    IndexedFieldTermCount c = valuesMap.get(termValLC);
    if (c == null) {
        valuesMap.put(termValLC, c = new IndexedFieldTermCount(fld, termVal, termValLC));
    }
    c.docFreq += docFreq;

    System.out.println("term=" + termVal + " " + docFreq);
}
From source file:trustframework.evidence.github.ConversationMimicry.java
private Map<String, Integer> getFrequencyMap(IndexReader ir, Integer docIndex) throws IOException {
    Terms frequencyVector = ir.getTermVector(docIndex, "Content");
    TermsEnum termsIterator = frequencyVector.iterator();
    Map<String, Integer> frequencyMap = new HashMap<>();
    BytesRef text;
    while ((text = termsIterator.next()) != null) {
        String term = text.utf8ToString();
        int freq = (int) termsIterator.totalTermFreq();
        frequencyMap.put(term, freq);
    }
    return frequencyMap;
}
From source file:uk.ac.ebi.biostudies.efo.Autocompletion.java
License:Apache License
public List<String> getTerms(String fieldName, int minFreq) throws IOException {
    List<String> termsList = new ArrayList<>();
    try {
        IndexReader reader = indexManager.getIndexReader();
        Terms terms = MultiTerms.getTerms(reader, fieldName);
        if (null != terms) {
            TermsEnum iterator = terms.iterator();
            BytesRef byteRef;
            while ((byteRef = iterator.next()) != null) {
                if (iterator.docFreq() >= minFreq) {
                    termsList.add(byteRef.utf8ToString());
                }
            }
        }
    } catch (Exception ex) {
        logger.error("getTerms problem", ex);
    }
    return termsList;
}
From source file:uk.co.flax.luwak.analysis.TermsEnumTokenStream.java
License:Apache License
@Override
public final boolean incrementToken() throws IOException {
    clearAttributes();
    BytesRef bytes = termsEnum.next();
    if (bytes == null)
        return false;
    charTerm.setEmpty();
    charTerm.append(bytes.utf8ToString());
    return true;
}