List of usage examples for org.apache.lucene.index IndexReader getTermVector
public final Terms getTermVector(int docID, String field) throws IOException
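A minimal sketch of the typical iteration pattern before the full examples below. Assumptions (not from any of the listed sources): the field name "contents" and the helper name printTermVector are illustrative only, the field was indexed with term vectors enabled, and `reader` is an already-open IndexReader.

import java.io.IOException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

static void printTermVector(IndexReader reader, int docID) throws IOException {
    // getTermVector returns null if no term vector was stored for this document/field
    Terms terms = reader.getTermVector(docID, "contents");
    if (terms == null) {
        return;
    }
    TermsEnum te = terms.iterator();
    BytesRef term;
    while ((term = te.next()) != null) {
        // On a single-document term vector, totalTermFreq() is the term's frequency in this document
        System.out.println(term.utf8ToString() + " " + te.totalTermFreq());
    }
}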
From source file:Evaluator.TermFreq.java
public DocVector(String docid, IndexReader reader, IndexSearcher searcher) throws IOException, ParseException {
    this.docid = docid;
    this.termVector = new ArrayList<TermFreq>();
    Analyzer analyzer = new KeywordAnalyzer();
    QueryParser parser = new QueryParser("id", analyzer);
    Query query = parser.parse(docid);
    TopDocs topdocs = searcher.search(query, 1);
    int index = topdocs.scoreDocs[0].doc;
    Terms terms = reader.getTermVector(index, "words");
    TermsEnum termsEnum = terms.iterator();
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
        TermFreq tf = new TermFreq(term.utf8ToString(), (int) termsEnum.totalTermFreq());
        this.termVector.add(tf);
    }
}
From source file:game.TermFreq.java
void loadTfVec() throws Exception {
    IndexReader reader = retriever.getReader();
    long sumDf = reader.getSumDocFreq(TrecDocRetriever.FIELD_ANALYZED_CONTENT);
    Terms terms = reader.getTermVector(luceneDocIdToGuess, FIELD_ANALYZED_CONTENT);
    if (terms == null || terms.size() == 0)
        return;

    TermsEnum termsEnum;
    BytesRef term;
    tfvec = new ArrayList<>();

    // Construct the normalized tf vector
    termsEnum = terms.iterator(null); // access the terms for this field
    int doclen = 0;
    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        String termStr = term.utf8ToString();
        String stem = retriever.analyze(termStr);
        DocsEnum docsEnum = termsEnum.docs(null, null); // enumerate through documents, in this case only one
        while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
            // get the term frequency in the document
            int tf = docsEnum.freq();
            TermFreq tfq = new TermFreq(new Term(TrecDocRetriever.FIELD_ANALYZED_CONTENT, term), termStr, tf);
            tfvec.add(tfq);
            doclen += tf;
        }
    }

    for (TermFreq tf : tfvec) {
        tf.tf = tf.tf / (float) doclen; // normalize by doc length
        float idf = sumDf / reader.docFreq(tf.term);
        tf.wt = (float) (Math.log(1 + LAMBDA / (ONE_MINUS_LAMBDA) * tf.tf * idf));
    }

    Collections.sort(tfvec);
}
From source file:io.anserini.integration.IndexerTest.java
License:Apache License
@Test
public void testIterateThroughDocumentVector() throws Exception {
    Directory dir = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir);

    int numDocs = reader.numDocs();
    // Iterate through the document vectors
    for (int i = 0; i < numDocs; i++) {
        System.out.println(reader.document(i));
        Terms terms = reader.getTermVector(i, "text");
        TermsEnum te = terms.iterator();
        Term term;
        // For this document, iterate through the terms.
        while (te.next() != null) {
            term = new Term("text", te.term());
            long tf = te.totalTermFreq();
            // Print out the term and its term frequency
            System.out.println(term.bytes().utf8ToString() + " " + tf);
        }
    }
}
From source file:io.anserini.integration.IndexerTest.java
License:Apache License
@Test
public void testIterateThroughDocumentVectorComputeBM25() throws Exception {
    Directory dir = FSDirectory.open(tempDir1);
    IndexReader reader = DirectoryReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(new BM25Similarity());

    int numDocs = reader.numDocs();
    // Iterate through the document vectors
    for (int i = 0; i < numDocs; i++) {
        String docid = reader.document(i).getField("docid").stringValue();
        System.out.println(reader.document(i));
        System.out.println(i + ": " + docid);

        Terms terms = reader.getTermVector(i, "text");
        TermsEnum te = terms.iterator();
        // For this document, iterate through the terms.
        while (te.next() != null) {
            String term = new Term("text", te.term()).bytes().utf8ToString();
            long tf = te.totalTermFreq();

            // The way to compute the BM25 score is to issue a query with the exact docid and the
            // term in question, and look at the retrieval score.
            Query filterQuery = new TermQuery(new Term("docid", docid)); // the docid
            Query termQuery = new TermQuery(new Term("text", term));     // the term
            BooleanQuery.Builder builder = new BooleanQuery.Builder();   // must have both
            builder.add(filterQuery, BooleanClause.Occur.MUST);
            builder.add(termQuery, BooleanClause.Occur.MUST);
            Query finalQuery = builder.build();
            TopDocs rs = searcher.search(finalQuery, 1); // issue the query

            // The BM25 weight is the maxScore
            System.out.println(term + " " + tf + " " + rs.getMaxScore());
        }
    }
}
From source file:io.anserini.rerank.lib.AxiomReranker.java
License:Apache License
/**
 * Extract ALL the terms from the documents pool.
 *
 * @param docIds The reranking pool, see {@link #selectDocs} for explanations
 * @param context An instance of RerankerContext
 * @param filterPattern A regex pattern; terms are collected only if they match the pattern. May be null.
 * @return A Map of term -> Set of docIds, a small inverted list recording the documents in which each term occurs
 */
private Map<String, Set<Integer>> extractTerms(Set<Integer> docIds, RerankerContext<T> context,
        Pattern filterPattern) throws Exception, IOException {
    IndexReader reader;
    IndexSearcher searcher;
    if (this.externalIndexPath != null) {
        Path indexPath = Paths.get(this.externalIndexPath);
        if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
            throw new IllegalArgumentException(
                    this.externalIndexPath + " does not exist or is not a directory.");
        }
        reader = DirectoryReader.open(FSDirectory.open(indexPath));
        searcher = new IndexSearcher(reader);
    } else {
        searcher = context.getIndexSearcher();
        reader = searcher.getIndexReader();
    }

    Map<String, Set<Integer>> termDocidSets = new HashMap<>();
    for (int docid : docIds) {
        Terms terms = reader.getTermVector(docid, LuceneDocumentGenerator.FIELD_BODY);
        if (terms == null) {
            LOG.warn("Document vector not stored for docid: " + docid);
            continue;
        }
        TermsEnum te = terms.iterator();
        if (te == null) {
            LOG.warn("Document vector not stored for docid: " + docid);
            continue;
        }
        while ((te.next()) != null) {
            String term = te.term().utf8ToString();
            // We do some noisy filtering here ... pure empirical heuristic
            if (term.length() < 2)
                continue;
            if (!term.matches("[a-z]+"))
                continue;
            if (filterPattern == null || filterPattern.matcher(term).matches()) {
                if (!termDocidSets.containsKey(term)) {
                    termDocidSets.put(term, new HashSet<>());
                }
                termDocidSets.get(term).add(docid);
            }
        }
    }
    return termDocidSets;
}
From source file:io.anserini.rerank.lib.Rm3Reranker.java
License:Apache License
private FeatureVector estimateRelevanceModel(ScoredDocuments docs, IndexReader reader, boolean tweetsearch) {
    FeatureVector f = new FeatureVector();

    Set<String> vocab = new HashSet<>();
    int numdocs = docs.documents.length < fbDocs ? docs.documents.length : fbDocs;
    FeatureVector[] docvectors = new FeatureVector[numdocs];

    for (int i = 0; i < numdocs; i++) {
        try {
            FeatureVector docVector = createdFeatureVector(reader.getTermVector(docs.ids[i], field), reader, tweetsearch);
            docVector.pruneToSize(fbTerms);
            vocab.addAll(docVector.getFeatures());
            docvectors[i] = docVector;
        } catch (IOException e) {
            e.printStackTrace();
            // Just return empty feature vector.
            return f;
        }
    }

    // Precompute the norms once and cache results.
    float[] norms = new float[docvectors.length];
    for (int i = 0; i < docvectors.length; i++) {
        norms[i] = (float) docvectors[i].computeL1Norm();
    }

    for (String term : vocab) {
        float fbWeight = 0.0f;
        for (int i = 0; i < docvectors.length; i++) {
            fbWeight += (docvectors[i].getFeatureWeight(term) / norms[i]) * docs.scores[i];
        }
        f.addFeatureWeight(term, fbWeight);
    }

    f.pruneToSize(fbTerms);
    f.scaleToUnitL1Norm();

    return f;
}
From source file:ir.project.TFIDFMatrix.java
private void createTermMap() {
    try {
        IndexReader reader = DirectoryReader.open(this.index);

        this.termMap = new HashMap<>(); // Map used to identify the position in the matrix for each term
        this.numDocs = reader.maxDoc();
        int count = 0;

        // Setup the termMap
        for (int i = 0; i < numDocs; i++) {
            Terms vector = reader.getTermVector(i, "text");
            if (vector == null) {
                System.err.println("Vector is null!");
                continue;
            }
            TermsEnum it = vector.iterator();
            while (it.next() != null) {
                Term t = new Term("text", it.term().utf8ToString());
                if (!termMap.containsKey(it.term().utf8ToString())) {
                    termMap.put(it.term().utf8ToString(), count);
                    count += 1;
                }
            }
        }
        this.numTerms = count;
        reader.close();
    } catch (IOException ex) {
        Logger.getLogger(TFIDFMatrix.class.getName()).log(Level.SEVERE, null, ex);
    }
}
From source file:ir.project.TFIDFMatrix.java
private void createMatrix() {
    try {
        this.matrix = new TFIDFBookVector[numDocs];
        IndexReader reader = DirectoryReader.open(this.index);

        for (int i = 0; i < numDocs; i++) {
            Terms vector = reader.getTermVector(i, "text");

            // get title
            IndexableField titleField = reader.document(i).getField("title");
            String title = titleField.stringValue();

            // get isbn
            IndexableField isbnField = reader.document(i).getField("isbn");
            String isbn = isbnField.stringValue();

            // get author
            IndexableField authorField = reader.document(i).getField("author");
            String author = authorField.stringValue();

            this.matrix[i] = new TFIDFBookVector(numTerms, title, isbn, author);

            if (vector == null) {
                System.err.println("Vector is null");
                continue;
            }

            TermsEnum it = vector.iterator();
            while (it.next() != null) {
                Term t = new Term("text", it.term().utf8ToString());

                // totalTermFreq returns the frequency of the term in this document.
                Long tf = it.totalTermFreq();
                double idf = (double) 1 / (double) reader.totalTermFreq(t);
                double tfIdfWeight = tf * idf;

                // put TF-IDF weight in matrix
                int termIndex = this.termMap.get(it.term().utf8ToString());
                this.matrix[i].editValue(termIndex, tfIdfWeight);
            }
        }
        reader.close();
    } catch (IOException ex) {
        Logger.getLogger(TFIDFMatrix.class.getName()).log(Level.SEVERE, null, ex);
    }
}
From source file:it.cnr.isti.hpc.dexter.lucene.LuceneHelper.java
License:Apache License
/**
 * Returns the cosine similarity between two documents.
 *
 * @param x
 *            - the WikiId of the first document
 * @param y
 *            - the WikiId of the second document
 * @param field
 *            - the field on which to compute the similarity
 *
 * @return a double between 0 (not similar) and 1 (same content),
 *         representing the similarity between the 2 documents
 */
public double getCosineSimilarity(int x, int y, String field) {
    IndexReader reader = getReader();
    Terms tfvX = null;
    Terms tfvY = null;
    try {
        tfvX = reader.getTermVector(getLuceneId(x), field);
        tfvY = reader.getTermVector(getLuceneId(y), field);
    } catch (IOException e) {
        logger.error("computing cosine similarity ({}) ", e.toString());
        System.exit(-1);
    }

    Map<String, Integer> xfrequencies = new HashMap<String, Integer>();
    Map<String, Integer> yfrequencies = new HashMap<String, Integer>();
    TermsEnum xtermsEnum = null;
    try {
        xtermsEnum = tfvX.iterator(null);
        BytesRef text;
        while ((text = xtermsEnum.next()) != null) {
            String term = text.utf8ToString();
            int freq = (int) xtermsEnum.totalTermFreq();
            xfrequencies.put(term, freq);
        }

        TermsEnum ytermsEnum = tfvY.iterator(null);
        while ((text = ytermsEnum.next()) != null) {
            String term = text.utf8ToString();
            int freq = (int) ytermsEnum.totalTermFreq();
            yfrequencies.put(term, freq);
        }
    } catch (IOException e) {
        logger.error("computing cosine similarity ({}) ", e.toString());
        System.exit(-1);
    }

    Map<String, Double> xTfidf = new HashMap<String, Double>();
    Map<String, Double> yTfidf = new HashMap<String, Double>();
    double xnorm = tfidfVector(xTfidf, xfrequencies, field);
    double ynorm = tfidfVector(yTfidf, yfrequencies, field);

    double dotproduct = 0;
    for (Map.Entry<String, Double> k : xTfidf.entrySet()) {
        if (yTfidf.containsKey(k.getKey())) {
            logger.info("key {}", k.getKey());
            logger.info("key x {} y {} ", k.getValue(), yTfidf.get(k.getKey()));
            dotproduct += k.getValue() * yTfidf.get(k.getKey());
            logger.info("dotproduct {} ", dotproduct);
        }
    }

    return dotproduct / (xnorm * ynorm);
}
From source file:lia.chapter5.CategorizerTest.java
License:Apache License
private void buildCategoryVectors() throws IOException {
    IndexSearcher searcher = Utils.getBookIndexSearcher();
    IndexReader reader = searcher.getIndexReader();

    int maxDoc = reader.maxDoc();
    System.out.println(maxDoc);

    for (int i = 0; i < maxDoc; i++) {
        Document doc = reader.document(i);
        String category = doc.get("category");
        System.out.println("\n" + doc.get("subject") + "\n");

        Map vectorMap = (Map) categoryMap.get(category);
        if (vectorMap == null) {
            vectorMap = new TreeMap();
            categoryMap.put(category, vectorMap);
        }

        Terms termsVector = reader.getTermVector(i, "subject");
        addTermFreqToMap(vectorMap, termsVector);
    }
}