List of usage examples for org.apache.lucene.index IndexReader getTermVector
public final Terms getTermVector(int docID, String field) throws IOException
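Before the examples, here is a minimal sketch of the typical call pattern (my own illustration, not from any of the sources below; it assumes Lucene 4.x, a placeholder index path "./myindex", and a placeholder field "text"): open a reader, fetch the per-document term vector for a field that was indexed with term vectors enabled, and iterate its terms. Note that getTermVector returns null when no term vector is stored for that document/field.

import java.io.File;
import java.io.IOException;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class GetTermVectorSketch {
    public static void main(String[] args) throws IOException {
        // "./myindex" and "text" are placeholders; the field must have been
        // indexed with term vectors enabled (FieldType.setStoreTermVectors(true)).
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("./myindex")));
        try {
            int docID = 0; // an internal Lucene document id, e.g. from ScoreDoc.doc
            Terms vector = reader.getTermVector(docID, "text");
            if (vector == null) {
                System.out.println("No term vector stored for this doc/field.");
                return;
            }
            TermsEnum termsEnum = vector.iterator(null); // Lucene 4.x signature
            BytesRef term;
            while ((term = termsEnum.next()) != null) {
                // For a term vector, totalTermFreq() is the term's frequency within this one document
                System.out.println(term.utf8ToString() + " -> " + termsEnum.totalTermFreq());
            }
        } finally {
            reader.close();
        }
    }
}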
From source file:ci6226.facetsearch.java
public static void main(String[] args) throws Exception {
    String index = "./myindex";
    String field = "text";
    String queries = null;
    int hitsPerPage = 10;
    boolean raw = false;
    // http://lucene.apache.org/core/4_0_0/facet/org/apache/lucene/facet/doc-files/userguide.html#facet_accumulation
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(index)));
    IndexSearcher searcher = new IndexSearcher(reader);
    // :Post-Release-Update-Version.LUCENE_XY:
    // TODO: use the same analyzer that was used to build the index
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);
    BufferedReader in = null;
    if (queries != null) {
        in = new BufferedReader(new InputStreamReader(new FileInputStream(queries), "UTF-8"));
    } else {
        in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
    }
    // :Post-Release-Update-Version.LUCENE_XY:
    QueryParser parser = new QueryParser(Version.LUCENE_47, field, analyzer);
    while (true) {
        System.out.println("Enter query: ");
        String line = in.readLine();
        if (line == null) {
            break; // guard against EOF on stdin
        }
        line = line.trim();
        if (line.length() == 0) {
            break;
        }
        Query query = parser.parse(line);
        System.out.println("Searching for: " + query.toString(field));
        Date start = new Date();
        searcher.search(query, null, 100);
        Date end = new Date();
        System.out.println("Time: " + (end.getTime() - start.getTime()) + "ms");
        TopDocs results = searcher.search(query, 5 * hitsPerPage);
        ScoreDoc[] hits = results.scoreDocs;
        int numTotalHits = results.totalHits;
        // N = max docs, df = total matched docs, idf = log(N/df)
        for (int i = 0; i < hits.length; i++) {
            Document doc = searcher.doc(hits[i].doc);
            System.out.println(ANSI_BLUE + (i + 1) + ANSI_RESET + "\nScore=\t" + hits[i].score);
            String rtext = doc.get(field);
            System.out.println("Text=\t" + rtext);
            // Use the internal doc id (hits[i].doc), not the hit's rank i
            Terms vector = reader.getTermVector(hits[i].doc, field);
            if (vector == null)
                continue;
            TermsEnum termsEnum = vector.iterator(null);
            Map<String, Integer> frequencies = new HashMap<>();
            BytesRef text = null;
            while ((text = termsEnum.next()) != null) {
                String term = text.utf8ToString();
                int freq = (int) termsEnum.totalTermFreq();
                frequencies.put(term, freq);
            }
        }
        System.out.println(numTotalHits + " total matching documents");
    }
    reader.close();
}
From source file:com.globalsight.ling.lucene.HighFreqTerms.java
License:Apache License
public static void main(String[] args) throws Exception {
    IndexReader reader = null;
    if (args.length == 1) {
        SimpleFSDirectory fsd = new SimpleFSDirectory(new File(args[0]));
        reader = DirectoryReader.open(fsd);
    } else {
        usage();
        System.exit(1);
    }
    TermInfoQueue tiq = new TermInfoQueue(numTerms);
    // TODO: is this the right field?
    String field = IndexDocument.TEXT;
    // Note: this reads the term vector of document 0 only,
    // not the index-wide term dictionary
    Terms terms = reader.getTermVector(0, field);
    TermsEnum termsEnum = terms.iterator(null);
    BytesRef next = null;
    while ((next = termsEnum.next()) != null) {
        tiq.insertWithOverflow(new TermInfo(new Term(field, termsEnum.term()), termsEnum.docFreq()));
    }
    while (tiq.size() != 0) {
        TermInfo termInfo = (TermInfo) tiq.pop();
        System.out.println(termInfo.term + " " + termInfo.docFreq);
    }
    reader.close();
}
From source file:com.mathworks.xzheng.advsearching.CategorizerTest.java
License:Apache License
private void buildCategoryVectors() throws IOException {
    IndexReader reader = DirectoryReader.open(TestUtil.getBookIndexDirectory());
    int maxDoc = reader.maxDoc();
    // The original snippet's "if (!reader.document(i))" does not compile;
    // in Lucene 4.x deleted documents are skipped via the live-docs bits.
    Bits liveDocs = MultiFields.getLiveDocs(reader); // null when there are no deletions
    for (int i = 0; i < maxDoc; i++) {
        if (liveDocs != null && !liveDocs.get(i)) {
            continue; // skip deleted documents
        }
        Document doc = reader.document(i);
        String category = doc.get("category");
        Map vectorMap = (Map) categoryMap.get(category);
        if (vectorMap == null) {
            vectorMap = new TreeMap();
            categoryMap.put(category, vectorMap);
        }
        Terms terms = reader.getTermVector(i, "subject");
        addTermFreqToMap(vectorMap, terms);
    }
}
From source file:com.o19s.solr.swan.highlight.TermVectorFun.java
License:Apache License
@Test
public void testBlah() throws IOException {
    RAMDirectory ramDir = new RAMDirectory();
    // Index some made-up content
    IndexWriterConfig iwf = new IndexWriterConfig(Version.LUCENE_47, new StandardAnalyzer(Version.LUCENE_47));
    IndexWriter writer = new IndexWriter(ramDir, iwf);
    FieldType ft = new FieldType();
    ft.setIndexed(true);
    ft.setTokenized(true);
    ft.setStored(true);
    ft.setStoreTermVectorOffsets(true);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);
    ft.freeze();
    for (int i = 0; i < DOCS.length; i++) {
        Document doc = new Document();
        StringField id = new StringField("id", "doc_" + i, StringField.Store.YES);
        doc.add(id);
        // Store both position and offset information
        Field text = new Field("content", DOCS[i], ft);
        doc.add(text);
        writer.addDocument(doc);
    }
    // Get a searcher over a near-real-time reader
    AtomicReader dr = SlowCompositeReaderWrapper.wrap(DirectoryReader.open(writer, true));
    IndexSearcher searcher = new IndexSearcher(dr);
    // Do a search using SpanQuery
    SpanTermQuery fleeceQ = new SpanTermQuery(new Term("content", "fleece"));
    TopDocs results = searcher.search(fleeceQ, 10);
    for (int i = 0; i < results.scoreDocs.length; i++) {
        ScoreDoc scoreDoc = results.scoreDocs[i];
        System.out.println("Score Doc: " + scoreDoc);
    }
    IndexReader reader = searcher.getIndexReader();
    Bits acceptDocs = null;
    Map<Term, TermContext> termContexts = new HashMap<Term, TermContext>();
    Spans spans = fleeceQ.getSpans(dr.getContext(), acceptDocs, termContexts);
    while (spans.next()) {
        System.out.println("Doc: " + spans.doc() + " Start: " + spans.start() + " End: " + spans.end());
        DocumentStoredFieldVisitor visitor = new DocumentStoredFieldVisitor("content");
        reader.document(spans.doc(), visitor);
        // Dump every term in this document's term vector
        Terms terms = reader.getTermVector(spans.doc(), "content");
        TermsEnum tenum = terms.iterator(null);
        while (tenum.next() != null) {
            System.out.println(tenum.term().utf8ToString());
        }
        // (The original source also contained a commented-out, unfinished loop
        // that built a +/- window of terms around each span; it is omitted here
        // because its braces were unbalanced and its body was entirely dead code.)
    }
}
From source file:com.romeikat.datamessie.core.processing.service.fulltext.query.QueryUtil.java
License:Open Source License
public List<String> getIndexTerms(final FullTextSession fullTextSession, final int luceneDocumentId,
        final Class<?> clazz, final String field) {
    final IndexReader indexReader = fullTextSession.getSearchFactory().getIndexReaderAccessor().open(clazz);
    try {
        final Terms terms = indexReader.getTermVector(luceneDocumentId, field);
        final List<String> termsList = Lists.newArrayListWithExpectedSize((int) terms.size());
        final TermsEnum termsEnum = terms.iterator();
        BytesRef text;
        while ((text = termsEnum.next()) != null) {
            final String term = text.utf8ToString();
            termsList.add(term);
        }
        return termsList;
    } catch (final IOException e) {
        LOG.error("Could not determine index terms", e);
        return null;
    } finally {
        // Readers obtained from an IndexReaderAccessor must be returned through it
        fullTextSession.getSearchFactory().getIndexReaderAccessor().close(indexReader);
    }
}
From source file:Dl4j.TermInfo.java
protected Map<String, TermInfo> buildTerms(IndexReader reader, int docId) throws Exception {
    Map<String, TermInfo> wmap = new HashMap<>();
    Terms tfvector;
    TermsEnum termsEnum;
    String termText;
    BytesRef term;
    tfvector = reader.getTermVector(docId, CONTENET_FIELD_NAME);
    // Construct the normalized tf vector
    termsEnum = tfvector.iterator(); // access the terms for this field
    while ((term = termsEnum.next()) != null) { // explore the terms for this field
        termText = term.utf8ToString();
        TermInfo termInfo = wmap.get(termText);
        if (termInfo == null) {
            termInfo = new TermInfo(termText, getTermId(termText));
        }
        termInfo.tf++;
        wmap.put(termText, termInfo);
    }
    return wmap;
}
From source file:edu.cuhk.hccl.cmd.AppSearchEngine.java
License:Apache License
public static void main(String[] args) throws IOException {
    // Get parameters
    CommandLineParser parser = new BasicParser();
    Options options = createOptions();
    File dataFolder = null;
    String queryStr = null;
    int topK = 0;
    File resultFile = null;
    String queryType = null;
    File similarityFile = null;
    try {
        CommandLine line = parser.parse(options, args);
        dataFolder = new File(line.getOptionValue('d'));
        queryStr = line.getOptionValue('q');
        queryType = line.getOptionValue('t');
        topK = Integer.parseInt(line.getOptionValue('k'));
        resultFile = new File(line.getOptionValue('f'));
        similarityFile = new File(line.getOptionValue('s'));
        if (line.hasOption('m')) {
            String modelPath = line.getOptionValue('m');
            if (queryType.equalsIgnoreCase("WordVector")) {
                expander = new WordVectorExpander(modelPath);
            } else if (queryType.equalsIgnoreCase("WordNet")) {
                expander = new WordNetExpander(modelPath);
            } else {
                System.out.println("Please choose a correct expander: WordNet or WordVector!");
                System.exit(-1);
            }
        }
    } catch (ParseException exp) {
        System.out.println("Error in parameters: \n" + exp.getMessage());
        System.exit(-1);
    }

    // Create index
    StandardAnalyzer analyzer = new StandardAnalyzer();
    Directory index = createIndex(dataFolder, analyzer);

    // Build query
    Query query = buildQuery(analyzer, queryStr, queryType);

    // Search index for topK hits
    IndexReader reader = DirectoryReader.open(index);
    IndexSearcher searcher = new IndexSearcher(reader);
    TopScoreDocCollector collector = TopScoreDocCollector.create(topK, true);
    searcher.search(query, collector);
    ScoreDoc[] hits = collector.topDocs().scoreDocs;

    // Show search results
    System.out.println("\n[INFO] " + hits.length + " hits were returned:");
    List<String> hitLines = new ArrayList<String>();
    for (int i = 0; i < hits.length; i++) {
        int docId = hits[i].doc;
        Document d = searcher.doc(docId);
        String line = (i + 1) + "\t" + d.get(PATH_FIELD) + "\t" + hits[i].score;
        System.out.println(line);
        hitLines.add(line);
    }

    // Compute cosine similarity between documents
    List<String> simLines = new ArrayList<String>();
    for (int m = 0; m < hits.length; m++) {
        int doc1 = hits[m].doc;
        Terms terms1 = reader.getTermVector(doc1, CONTENT_FIELD);
        for (int n = m + 1; n < hits.length; n++) {
            int doc2 = hits[n].doc;
            Terms terms2 = reader.getTermVector(doc2, CONTENT_FIELD);
            CosineDocumentSimilarity cosine = new CosineDocumentSimilarity(terms1, terms2);
            double similarity = cosine.getCosineSimilarity();
            String line = searcher.doc(doc1).get(PATH_FIELD) + "\t" + searcher.doc(doc2).get(PATH_FIELD)
                    + "\t" + similarity;
            simLines.add(line);
        }
    }

    // Release resources
    reader.close();
    if (expander != null) {
        expander.close();
    }

    // Save search results
    System.out.println("\n[INFO] Search results are saved in file: " + resultFile.getPath());
    FileUtils.writeLines(resultFile, hitLines, false);
    System.out.println("\n[INFO] Cosine similarities are saved in file: " + similarityFile.getPath());
    FileUtils.writeLines(similarityFile, simLines, false);
}
From source file:edu.illinois.cs.cogcomp.bigdata.lucene.Lucene.java
License:Open Source License
/**
 * Returns term frequencies for a given doc.
 *
 * @param reader the index reader
 * @param field the field whose term vector to read
 * @return a map from term text to its frequency within the document
 * @throws IOException
 */
public static Map<String, Float> getTfs(IndexReader reader, String field, int docID) throws IOException {
    Map<String, Float> termFrequencies = new HashMap<>();
    Terms terms = reader.getTermVector(docID, field);
    TermsEnum itr = terms.iterator();
    BytesRef term = null;
    while ((term = itr.next()) != null) {
        String termText = term.utf8ToString();
        long termFreq = itr.totalTermFreq(); // term freq in doc with docID
        termFrequencies.put(termText, (float) termFreq);
    }
    return termFrequencies;
}
From source file:edu.utsa.sifter.Result.java
License:Apache License
public DocTermInfo docRankFactors(final double[] features, final Date refDate, final IndexReader rdr,
        final Set<Term> termSet) throws IOException {
    // J.S.
    final double[] featuresA = new double[19];
    final DocTermInfo ret = new DocTermInfo();
    final String lowerExt = Extension.toLowerCase();
    if (!isUnallocated()) {
        features[HitRanker.FCREATED] = dateDiff(Created, refDate);
        features[HitRanker.FMODIFIED] = dateDiff(Modified, refDate);
        features[HitRanker.FACCESSED] = dateDiff(Accessed, refDate);
        features[HitRanker.FAVG_RECENCY] = (features[HitRanker.FCREATED] + features[HitRanker.FMODIFIED]
                + features[HitRanker.FACCESSED]) / 3;
        features[HitRanker.FFILENAME_DIRECT] = 0;
        features[HitRanker.FFILENAME_INDIRECT] = 0;
        final String fullPath = Path + Name;
        for (Term t : termSet) {
            if (fullPath.indexOf(t.text()) > 0) {
                features[HitRanker.FFILENAME_INDIRECT] = 1;
                break;
            }
        }
        features[HitRanker.FUSER_DIRECTORY] = 0;
        for (String dir : SystemDirs) {
            if (Path.indexOf(dir) > -1) {
                features[HitRanker.FUSER_DIRECTORY] = 1;
                break;
            }
        }
    }
    features[HitRanker.FHIGH_PRIORITY_TYPE] = DocMaker.HighPriorityTypes.contains(lowerExt) ? 1 : 0;
    features[HitRanker.FMED_PRIORITY_TYPE] = DocMaker.MedPriorityTypes.contains(lowerExt) ? 1 : 0;
    features[HitRanker.FLOW_PRIORITY_TYPE] = features[HitRanker.FHIGH_PRIORITY_TYPE]
            + features[HitRanker.FMED_PRIORITY_TYPE] > 0 ? 0 : 1;
    final Terms terms = rdr.getTermVector(LuceneID, "body");
    final TermsEnum term = terms.iterator(null);
    double dotSum = 0, docVecSumSqrs = 0, numDims = 0, queryVecSumSqrs = 0;
    long termCount = 0;
    while (term.next() != null) {
        ++numDims;
        termCount = term.totalTermFreq();
        docVecSumSqrs += termCount * termCount;
        if (termSet.contains(new Term("body", term.term()))) {
            dotSum += termCount;
            ++queryVecSumSqrs;
            ret.TermFreqs.put(BytesRef.deepCopyOf(term.term()), termCount);
            ret.MaxTermFreq = Math.max(ret.MaxTermFreq, termCount);
        }
    }
    features[HitRanker.FCOSINE_SIMILARITY] = dotSum / (Math.sqrt(docVecSumSqrs) + Math.sqrt(queryVecSumSqrs));
    features[HitRanker.FTERM_CARDINALITY] = queryVecSumSqrs / termSet.size();
    // features[HitRanker.FTERM_LENGTH]
    // features[HitRanker.FTERM_PRIORITY] = 0.0;
    return ret;
}
From source file:Evaluator.TermFreq.java
public DocVector(String docid, IndexReader reader, IndexSearcher searcher, String fileName)
        throws IOException, ParseException {
    this.docid = docid;
    this.termVector = new ArrayList<TermFreq>();
    Analyzer analyzer = new KeywordAnalyzer();
    QueryParser parser = new QueryParser("id", analyzer);
    Query query = parser.parse(docid);
    TopDocs topdocs = searcher.search(query, 1);
    int index = topdocs.scoreDocs[0].doc;
    VectorExtractor ve = new VectorExtractor();
    HashMap<String, ArrayList<Double>> h1 = ve.extractVector(fileName);
    vector = h1.get(docid);
    Terms terms = reader.getTermVector(index, "words");
    TermsEnum termsEnum = terms.iterator();
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
        TermFreq tf = new TermFreq(term.utf8ToString(), (int) termsEnum.totalTermFreq());
        this.termVector.add(tf);
    }
}