List of usage examples for org.apache.lucene.util BytesRef utf8ToString
public String utf8ToString()
From source file: util.AllTerms.java
public void initAllTerms() throws IOException { int pos = 0;//from ww w . jav a 2s . c om for (int docId = 0; docId < totalNoOfDocumentInIndex; docId++) { Terms vector = indexReader.getTermVector(docId, Configuration.FIELD_CONTENT); TermsEnum termsEnum = null; termsEnum = vector.iterator(termsEnum); BytesRef text = null; while ((text = termsEnum.next()) != null) { String term = text.utf8ToString(); allTerms.put(term, pos++); } } //Update postition pos = 0; for (Iterator it = allTerms.entrySet().iterator(); it.hasNext();) { Entry<String, Integer> s = (Entry<String, Integer>) it.next(); System.out.println(s.getKey()); s.setValue(pos++); } }
From source file: util.VectorGenerator.java
public DocVector[] GetDocumentVectors() throws IOException { for (int docId = 0; docId < totalNoOfDocumentInIndex; docId++) { Terms vector = indexReader.getTermVector(docId, Configuration.FIELD_CONTENT); TermsEnum termsEnum = null;// w w w .j av a 2s . c o m termsEnum = vector.iterator(termsEnum); BytesRef text = null; docVector[docId] = new DocVector(allterms); while ((text = termsEnum.next()) != null) { String term = text.utf8ToString(); int freq = (int) termsEnum.totalTermFreq(); docVector[docId].setEntry(term, freq); } docVector[docId].normalize(); } indexReader.close(); return docVector; }
From source file: vectorizer.TermInfo.java
private DocVector buildTerms(IndexReader reader, int docId, int numDocs, Dictionary dict) throws Exception { DocVector wmap = new DocVector(reader.document(docId).get(ID_FIELD_NAME)); Terms tfvector;/*from w w w.ja va 2 s .c o m*/ TermsEnum termsEnum; String termText; BytesRef term; int tf; float idf; tfvector = reader.getTermVector(docId, CONTENT_FIELD_NAME); if (tfvector == null) return null; // Construct the normalized tf vector termsEnum = tfvector.iterator(); // access the terms for this field while ((term = termsEnum.next()) != null) { // explore the terms for this field tf = (int) termsEnum.totalTermFreq(); termText = term.utf8ToString(); float df = reader.docFreq(new Term(CONTENT_FIELD_NAME, termText)); idf = (float) Math.log(1 + numDocs / df); TermInfo termInfo = new TermInfo(termText, tf, getTermId(termText), idf); if (dict != null) { Translations translations = dict.getTranslationTerms(termText); for (TranslationInfo tinfo : translations.getTranslationInfo()) { termInfo.tf *= tinfo.weight; } } // Update global stats TermInfo seenTermInfo = collFreq.get(termText); if (seenTermInfo == null) { seenTermInfo = new TermInfo(termInfo.term, termInfo.tf, termInfo.id, termInfo.idf); collFreq.put(termText, seenTermInfo); } else { seenTermInfo.tf += termInfo.tf; // coll freq } wmap.addTermInfo(termInfo); } return wmap; }
From source file: yasoco.TermScore.java
List<TermScore> selTerms(int docId, String fieldName, Query q) throws Exception { int num_q_terms = Integer.parseInt(prop.getProperty("num_q_terms", "10")); int N = reader.numDocs(); List<TermScore> tlist = new Vector<>(); Terms terms = reader.getTermVector(docId, fieldName); //get terms vectors for one document and one field if (terms == null || terms.size() == 0) return tlist; TermsEnum termsEnum = terms.iterator(null); // access the terms for this field BytesRef term = null; int docLen = 0; while ((term = termsEnum.next()) != null) {// explore the terms for this field DocsEnum docsEnum = termsEnum.docs(null, null); // enumerate through documents, in this case only one int docIdEnum; while ((docIdEnum = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { //get the term frequency in the document docLen += docsEnum.freq();/* w ww. j av a2 s.co m*/ } } termsEnum = terms.iterator(null); // access the terms for this field while ((term = termsEnum.next()) != null) {// explore the terms for this field Term t = new Term(fieldName, term); DocsEnum docsEnum = termsEnum.docs(null, null); // enumerate through documents, in this case only one int docIdEnum; while ((docIdEnum = docsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { //get the term frequency in the document int tf = docsEnum.freq(); float ntf = tf / (float) docLen; int df = (int) (reader.totalTermFreq(t)); float idf = N / (float) df; float tf_idf = lambda * ntf + (1 - lambda) * idf; tlist.add(new TermScore(term.utf8ToString(), tf_idf)); } } Collections.sort(tlist); // desc List<TermScore> topList = tlist.subList(0, Math.min(tlist.size(), num_q_terms)); return topList; }