List of usage examples for org.apache.lucene.index.IndexReader.docFreq

public abstract int docFreq(Term term) throws IOException;

Returns the number of documents containing the given term.

Parameter: term - the term whose document frequency is returned
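A minimal sketch of the call itself before the project examples below (it assumes a Lucene 4.x index already exists on disk; the index path, the field name "contents", and the term text "lucene" are placeholders, not values taken from any of the projects):

import java.io.File;
import java.io.IOException;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.FSDirectory;

public class DocFreqExample {
    public static void main(String[] args) throws IOException {
        // Placeholder index location -- point this at a real index directory.
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("/tmp/index")));
        try {
            // Placeholder field and term text; docFreq returns how many documents contain the term.
            Term term = new Term("contents", "lucene");
            int df = reader.docFreq(term);
            System.out.println("document frequency of " + term + " = " + df);
        } finally {
            reader.close();
        }
    }
}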
From source file:it.cnr.isti.hpc.dexter.lucene.LuceneHelper.java
License:Apache License
/**
 * Builds the TFIDF vector and its norm2
 *
 * @param tfidf
 *            - the vector containing for each term its TFIDF score, it will
 *            be populated by this method
 * @param freq
 *            - the vector containing for each term its frequency
 * @param field
 *            - the field on which to compute the inverse document frequency
 *
 * @return the norm of the TFIDF vector
 */
private double tfidfVector(Map<String, Double> tfidf, Map<String, Integer> freq, String field) {
    IndexReader reader = getReader();
    double norm = 0;
    for (Map.Entry<String, Integer> entry : freq.entrySet()) {
        Term t = new Term(field, entry.getKey());
        int df = 0;
        try {
            df = reader.docFreq(t);
        } catch (IOException e) {
            logger.error("computing tfidfVector ({}) ", e.toString());
            System.exit(-1);
        }
        double idf = Math.log(collectionSize / (double) df + 1) / Math.log(2) + 1;
        double tfidfValue = entry.getValue() * idf;
        norm += tfidfValue * tfidfValue;
        tfidf.put(entry.getKey(), tfidfValue);
    }
    return Math.sqrt(norm);
}
From source file:it.unibz.instasearch.indexing.SearchResultDoc.java
License:Open Source License
private float[] createTermScoreVector(TermFreqVector vect, IndexReader reader) throws IOException {
    if (vect == null)
        return new float[0];

    int[] termFrequencies = vect.getTermFrequencies();
    String[] terms = vect.getTerms();
    float[] scores = new float[terms.length];

    int numDocs = reader.maxDoc();
    Similarity sim = Searcher.SIMILARITY;

    for (int i = 0; i < terms.length; i++) {
        String termText = terms[i];
        Term term = new Term(Field.CONTENTS.toString(), termText);

        float termFreq = sim.tf(termFrequencies[i]);
        int docFreq = reader.docFreq(term);
        float idf = sim.idf(docFreq, numDocs);
        float tfIdf = termFreq * idf;

        scores[i] = tfIdf;
    }
    return scores;
}
From source file:it.unipd.dei.ims.falcon.indexing.Indexing.java
License:Apache License
/**
 * Indexes all the songs in the specified path.
 * The index is created in the specified directory "indexPath". If an index
 * already exists in that path, the songs are added to the existing index.
 * Each song is processed by the method
 * {@link it.unipd.dei.ims.falcon.indexing.Indexing#indexSong},
 * which maps the song into a set of segments, each of which is mapped to a
 * Lucene {@link org.apache.lucene.document.Document}.
 * The segments have a fixed length: each consists of "hashPerSegment" hashes,
 * and two consecutive segments may overlap by "hashInOverlap" hashes. The
 * number of hashes in the overlap must be smaller than the number of hashes
 * per segment, otherwise an
 * {@link it.unipd.dei.ims.falcon.indexing.IndexingException} is thrown.
 * <p>
 * Once the index has been created or updated, this method writes a map into a
 * file. The map associates a set of features to each hash, based on occurrence
 * statistics of the hash in the entire collection. In the event of an index
 * update the map is re-built and the map file is over-written.
 *
 * @param data Input file. If it is a directory, index all files inside it.
 * @param index Falcon index.
 * @param hashPerSegment Number of hashes per segment.
 * @param hashInOverlap Number of overlapping hashes per segment.
 * @throws IndexingException
 */
public static void index(File data, File index, final int hashPerSegment, final int hashInOverlap,
        final int subsampling, final int nranks, final double minkurtosis,
        final TranspositionEstimator transpEst, boolean verbose) throws IndexingException, IOException {
    long start_time = System.currentTimeMillis();
    if (hashPerSegment <= hashInOverlap)
        throw new IndexingException(
                "Number of hashes in the overlap cannot be equal to the number of hash per segment");
    if (!data.canRead())
        throw new IOException("cannot read input path");
    if (data.isDirectory()) {
        for (File f : data.listFiles())
            if (!f.canRead())
                throw new IOException("cannot read one or more input files");
    }
    if (!index.exists()) // if index is being created rather than updated
        index.mkdir();
    if (!index.canWrite())
        throw new IOException("cannot write to index directory");
    SimpleFSDirectory indexDir = new SimpleFSDirectory(index, null);
    // initialize Lucene Analyzer and IndexWriter
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    final IndexWriter writer = new IndexWriter(indexDir, analyzer, !IndexReader.indexExists(indexDir),
            IndexWriter.MaxFieldLength.UNLIMITED);
    writer.setSimilarity(new HashSimilarity());
    // transform chroma data into hashes and write into index
    File[] inputfiles = data.isDirectory() ? data.listFiles() : new File[] { data };
    int fileNo = 0;
    for (final File file : inputfiles) {
        // if the currently considered file exists and is not hidden
        if (file.exists() && !file.getName().startsWith(".")) {
            if (verbose)
                System.out.println(String.format("%10.3f%% - indexing %s",
                        fileNo * 100. / inputfiles.length, file.getAbsolutePath()));
            final List<OutputStream> fout = new LinkedList<OutputStream>();
            fout.add(new PipedOutputStream());
            final PipedInputStream fin = new PipedInputStream((PipedOutputStream) fout.get(0));
            Thread t = new Thread(new Runnable() {
                public void run() {
                    try {
                        ChromaMatrixUtils.convertChromaStreamIntoHashesStream(new FileReader(file), fout,
                                nranks, transpEst, minkurtosis, subsampling);
                    } catch (IOException ex) {
                        // TODO do something better for this exception ... (might hang all ...)
                        Logger.getLogger(Indexing.class.getName()).log(Level.SEVERE, null, ex);
                    }
                }
            });
            t.start();
            indexSong(writer, fin, hashPerSegment, hashInOverlap, file.getAbsolutePath(),
                    file.getAbsolutePath());
            fileNo++;
        }
    }
    writer.optimize();
    writer.close();
    // additional falcon features
    PrintWriter pw = new PrintWriter(index.getAbsolutePath() + "/qpruning_features.map");
    IndexReader reader = IndexReader.open(new SimpleFSDirectory(index));
    int numSegments = reader.numDocs();
    long total_hcf = numSegments * hashPerSegment; // total number of hashes in the collection
    TermEnum hashes = reader.terms(); // distinct hashes in the collection
    while (hashes.next()) {
        if (!hashes.term().field().equals("CONTENT")) {
            continue;
        }
        Term curHash = hashes.term();
        pw.print(curHash.text() + "\t");
        pw.print((double) reader.docFreq(curHash) / numSegments + "\t"); // normalized document frequency
        TermDocs curHash_pl = reader.termDocs(curHash); // posting list for the current hash
        // computation of the frequency of the current hash in the
        // entire collection -- value initialization
        long hcf = 0;
        // initializes the normalized maximum frequency value
        double nmf = 0;
        // initializes the normalized frequency for max computation
        double cur_nf = 0;
        // processes posting list entries
        while (curHash_pl.next()) {
            // computation of the normalized frequency for the current hash
            cur_nf = (double) curHash_pl.freq() / hashPerSegment;
            // update max if necessary
            if (cur_nf > nmf) {
                nmf = cur_nf;
            }
            hcf += curHash_pl.freq();
        }
        // prints normalized total collection frequency and
        // normalized maximum frequency for the current hash
        pw.print((double) hcf / total_hcf + "\t" + nmf + "\n");
    }
    pw.flush();
    pw.close();
    long end_time = System.currentTimeMillis();
    if (verbose)
        System.out.println(
                String.format("[INDEXING] - elapsed time: %10.3f", (end_time - start_time) / 1000.));
}
From source file:lucene.searchengine.LuceneSearchEngine.java
public static void main(String[] args) throws IOException {
    System.out.println(
            "Enter the FULL path where the index will be created: (e.g. /Usr/index or c:\\temp\\index)");

    String indexLocation = null;
    BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
    String s = br.readLine();

    LuceneSearchEngine indexer = null;
    try {
        indexLocation = s;
        indexer = new LuceneSearchEngine(s);
    } catch (Exception ex) {
        System.out.println("Cannot create index..." + ex.getMessage());
        System.exit(-1);
    }

    // ===================================================
    // read input from user until he enters q for quit
    // ===================================================
    while (!s.equalsIgnoreCase("q")) {
        try {
            System.out.println(
                    "Enter the FULL path to add into the index (q=quit): (e.g. /home/mydir/docs or c:\\Users\\mydir\\docs)");
            System.out.println("[Acceptable file types: .xml, .html, .html, .txt]");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
                break;
            }
            // try to add file into the index
            indexer.indexFileOrDirectory(s);
        } catch (Exception e) {
            System.out.println("Error indexing " + s + " : " + e.getMessage());
        }
    }

    // ===================================================
    // after adding, we always have to call the
    // closeIndex, otherwise the index is not created
    // ===================================================
    indexer.closeIndex();

    // =========================================================
    // Now search
    // =========================================================
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));

    //===========================================================
    // GET Term frequency
    //===========================================================
    // Creating an output file to store the term,term_frequency pairs.
    PrintWriter tfwriter = new PrintWriter("..\\term-frequency.csv");
    Fields fields = MultiFields.getFields(reader);
    HashMap<String, Long> tfmap = new HashMap<String, Long>();
    Terms terms = fields.terms("contents");
    TermsEnum termsEnum = terms.iterator(null);
    BytesRef bref = null;
    while ((bref = termsEnum.next()) != null) {
        String term_name = new String(bref.bytes, bref.offset, bref.length);
        Term term_instance = new Term("contents", term_name);
        long termFrequency = reader.totalTermFreq(term_instance);
        tfmap.put(term_name, termFrequency);
    }
    System.out.println(tfmap.size());
    for (String key : tfmap.keySet()) {
        tfwriter.write(key + "," + tfmap.get(key));
        tfwriter.write("\n");
    }
    tfwriter.close();
    //====================================================================
    // Code END to fetch term frequency
    //====================================================================
    IndexSearcher searcher = new IndexSearcher(reader);
    s = "";
    while (!s.equalsIgnoreCase("q")) {
        TopScoreDocCollector collector = TopScoreDocCollector.create(100, true);
        try {
            System.out.println("Enter the search query (q=quit):");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
                break;
            }
            Query q = new QueryParser(Version.LUCENE_47, "contents", sAnalyzer).parse(s);
            searcher.search(q, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            // 4. display results
            System.out.println("Found " + hits.length + " hits.");
            for (int i = 0; i < hits.length; ++i) {
                int docId = hits[i].doc;
                Document d = searcher.doc(docId);
                System.out.println((i + 1) + ". " + d.get("filename") + " score=" + hits[i].score);
            }
            // 5. term stats --> watch out for which "version" of the term
            // must be checked here instead!
            Term termInstance = new Term("contents", s);
            long termFreq = reader.totalTermFreq(termInstance);
            long docCount = reader.docFreq(termInstance);
            System.out.println(s + " Term Frequency " + termFreq + " - Document Frequency " + docCount);
        } catch (Exception e) {
            System.out.println("Error searching " + s + " : " + e.getMessage());
            break;
        }
    }
}
From source file:lucenesearche.HW3.java
public static void main(String[] args) throws IOException {
    System.out.println(
            "Enter the FULL path where the index will be created: (e.g. /Usr/index or c:\\temp\\index)");

    String indexLocation = null;
    BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
    String s = br.readLine();

    HW3 indexer = null;
    try {
        indexLocation = s;
        indexer = new HW3(s);
    } catch (Exception ex) {
        System.out.println("Cannot create index..." + ex.getMessage());
        System.exit(-1);
    }

    // ===================================================
    // read input from user until he enters q for quit
    // ===================================================
    while (!s.equalsIgnoreCase("q")) {
        try {
            System.out.println(
                    "Enter the FULL path to add into the index (q=quit): (e.g. /home/mydir/docs or c:\\Users\\mydir\\docs)");
            System.out.println("[Acceptable file types: .xml, .html, .html, .txt]");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
                break;
            }
            // try to add file into the index
            indexer.indexFileOrDirectory(s);
        } catch (Exception e) {
            System.out.println("Error indexing " + s + " : " + e.getMessage());
        }
    }

    // ===================================================
    // after adding, we always have to call the
    // closeIndex, otherwise the index is not created
    // ===================================================
    indexer.closeIndex();

    // =========================================================
    // Now search
    // =========================================================
    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));
    IndexSearcher searcher = new IndexSearcher(reader);
    TopScoreDocCollector collector = TopScoreDocCollector.create(100, true);
    Formatter f = new Formatter();
    s = "";
    while (!s.equalsIgnoreCase("q")) {
        try {
            System.out.println("Enter the search query (q=quit):");
            s = br.readLine();
            if (s.equalsIgnoreCase("q")) {
                break;
            }
            Query q = new QueryParser(Version.LUCENE_47, "contents", sAnalyzer).parse(s);
            searcher.search(q, collector);
            ScoreDoc[] hits = collector.topDocs().scoreDocs;

            // 4. display results
            String query1, query2, query3, query4;
            query1 = "Lucene_Q1_top100.txt";
            query2 = "Lucene_Q2_top100.txt";
            query3 = "Lucene_Q3_top100.txt";
            query4 = "Lucene_Q4_top100.txt";
            File luceneFile = new File(query4); // change filename for each query
            int query_id;
            query_id = 4; // change this for new query
            luceneFile.createNewFile();
            FileWriter writer = new FileWriter(luceneFile);
            writer.write(String.format("%-10s %-10s %-80s %-10s %-40s %-20s", "Query ID", "Q0",
                    "Document Name", "Rank", "Cosine Similarity Score", "System Name\n"));
            System.out.println("Found " + hits.length + " hits.");
            System.out.println(f.format("%-10s %-10s %-80s %-10s %-40s %-20s", "Query ID", "Q0",
                    "Document Name", "Rank", "Cosine Similarity Score", "System Name"));
            for (int i = 0; i < hits.length; ++i) {
                Formatter fmt = new Formatter();
                int docId = hits[i].doc;
                Document d = searcher.doc(docId);
                //System.out.println((i+1) +". " + d.get("path")+" "+ hits[i].score);
                writer.write(String.format("%-10s %-10s %-80s %-10s %-40s %-20s", "" + query_id, "Q0",
                        "" + d.get("path"), "" + (i + 1), "" + hits[i].score, "Shantanu-SYS-001\n"));
                writer.flush();
                System.out.println(fmt.format("%-10s %-10s %-80s %-10s %-40s %-20s", "" + query_id, "Q0",
                        "" + d.get("path"), "" + (i + 1), "" + hits[i].score, "Shantanu-SYS-001"));
            }
            writer.close();
            // 5. term stats --> watch out for which "version" of the term
            // must be checked here instead!
            Term termInstance = new Term("contents", s);
            long termFreq = reader.totalTermFreq(termInstance);
            long docCount = reader.docFreq(termInstance);
            System.out.println(s + " Term Frequency " + termFreq + " - Document Frequency " + docCount);
        } catch (Exception e) {
            System.out.println("Error searching " + s + " : " + e.getMessage());
            break;
        }
    }
}
From source file:magoffin.matt.lucene.LuceneSearchService.java
License:Open Source License
/**
 * Delete a Document from the index.
 *
 * <p>Check out <a
 * href="http://nagoya.apache.org/eyebrowse/ReadMsg?listName=lucene-user@jakarta.apache.org&msgId=1190557"
 * >this post</a> for info on how this is done.
 * </p>
 *
 * @param type the index type
 * @param reader the index to delete from
 * @param id the ID of the Document to delete, using the <code>idField</code> field
 * @return the number of items deleted
 */
protected int deleteFromIndex(String type, IndexReader reader, Object id) {
    if (id == null) {
        throw new IllegalArgumentException("Null ID passed to deleteFromIndex");
    }
    try {
        Term idTerm = new Term(idField, id.toString());
        if (reader.docFreq(idTerm) > 0) {
            int result = reader.deleteDocuments(idTerm);
            if (traceLog.isInfoEnabled()) {
                traceLog.info(TraceOp.DELETE + "Deleted " + result + " Document for ID " + id
                        + " from reader " + reader + " (" + reader.directory().toString() + ")");
            }
            LuceneServiceUtils.publishIndexEvent(new IndexEvent(id, EventType.DELETE, type),
                    this.indexEventListeners);
            return result;
        }
    } catch (IOException e) {
        throw new RuntimeException("IOException deleting Document from Lucene index", e);
    }
    return 0;
}
From source file:net.dataninja.ee.textEngine.MoreLikeThisQuery.java
License:Apache License
/**
 * Generate a query that will produce "more documents like" the first
 * in the sub-query.
 */
public Query rewrite(IndexReader reader) throws IOException {
    // If field boosts were specified, make sure there are the same number of
    // boosts as there are fields.
    //
    if (fieldBoosts != null && fieldBoosts.length != fieldNames.length)
        throw new RuntimeException(
                "Error: different number of boosts than fields specified to MoreLikeThisQuery");

    // Determine the target document.
    IndexSearcher searcher = new IndexSearcher(reader);
    targetDoc = -1;
    HitCollector collector = new HitCollector() {
        public void collect(int doc, float score) {
            if (targetDoc < 0)
                targetDoc = doc;
        }
    };
    searcher.search(subQuery, collector);

    // If none, make a query that will definitely return nothing at all.
    if (targetDoc < 0)
        return new TermQuery(new Term("fribbleSnarf", "!*@&#(*&"));

    // Eliminate fields with zero boost. Along the way, make a boost map so we
    // have fast access to the boost per field.
    //
    String[] fields = this.fieldNames;
    if (fieldBoosts != null) {
        ArrayList filteredFields = new ArrayList();
        for (int i = 0; i < fieldNames.length; i++) {
            if (fieldBoosts[i] > 0.0f) {
                filteredFields.add(fieldNames[i]);
                boostMap.put(fieldNames[i], new Float(fieldBoosts[i]));
            }
        }
        fields = (String[]) filteredFields.toArray(new String[filteredFields.size()]);
    }

    // If we've been asked to calculate the max document frequency, do it now.
    if (maxDocFreq < 0) {
        int nDocs = reader.docFreq(new Term("docInfo", "1"));
        maxDocFreq = Math.max(5, nDocs / 20);
    }

    // Add facet fields, if any. For now, spot them by name.
    XTFTextAnalyzer analyzer = new XTFTextAnalyzer(null, pluralMap, accentMap);
    for (int i = 0; i < fields.length; i++) {
        if (fields[i].indexOf("facet") >= 0)
            analyzer.addFacetField(fields[i]);
    }

    // Determine which terms are "best" for querying.
    PriorityQueue bestTerms = retrieveTerms(reader, targetDoc, analyzer);

    // Make the "more like this" query from those terms.
    Query rawQuery = createQuery(reader, bestTerms);

    // Exclude the original document in the result set.
    Query ret = new MoreLikeWrapper(this, rawQuery);
    if (Trace.getOutputLevel() >= Trace.debug)
        Trace.debug("More-like query: " + ret);
    return ret;
}
From source file:net.dataninja.ee.textEngine.MoreLikeThisQuery.java
License:Apache License
/**
 * Create the More like query from a PriorityQueue
 */
private Query createQuery(IndexReader indexReader, PriorityQueue q) throws IOException {
    // Pop everything from the queue.
    QueryWord[] queryWords = new QueryWord[q.size()];
    for (int i = q.size() - 1; i >= 0; i--)
        queryWords[i] = (QueryWord) q.pop();

    BooleanQuery query = new BooleanQuery(true /*disable coord*/);

    // At the moment, there's no need to scale by the best score. It simply
    // clouds the query explanation. It doesn't affect the scores, since
    // Lucene applies a query normalization factor anyway.
    //
    //float bestScore = (queryWords.length > 0) ? queryWords[0].score : 0.0f;
    for (int i = 0; i < fieldNames.length; i++) {
        ArrayList fieldClauses = new ArrayList();
        for (int j = 0; j < queryWords.length; j++) {
            QueryWord qw = queryWords[j];
            Term term = new Term(fieldNames[i], qw.word);

            // Skip words not present in this field.
            int docFreq = indexReader.docFreq(term);
            if (docFreq == 0)
                continue;

            // Add it to the query.
            SpanTermQuery tq = new SpanTermQuery(term);
            if (boost)
                tq.setBoost(qw.score);
            fieldClauses.add(tq);
        } // for j

        // If no terms for this field, skip it.
        if (fieldClauses.isEmpty())
            continue;

        SpanQuery[] clauses = (SpanQuery[]) fieldClauses.toArray(new SpanQuery[fieldClauses.size()]);

        // Now make a special Or-Near query out of the clauses.
        SpanOrNearQuery fieldQuery = new SpanOrNearQuery(clauses, 10, false);

        // Boost if necessary.
        if (fieldBoosts != null)
            fieldQuery.setBoost(fieldBoosts[i]);

        // We currently don't support more-like-this queries on the full text.
        // It would involve de-chunking, and also fancier logic to pick the
        // "most interesting" terms in the first place.
        //
        if (fieldNames[i].equals("text"))
            throw new RuntimeException("MoreLikeThisQuery does not support 'text' field.");

        // And add to the main query.
        query.add(fieldQuery, BooleanClause.Occur.SHOULD);
    } // for i

    // All done.
    return query;
}
From source file:net.dataninja.ee.textEngine.MoreLikeThisQuery.java
License:Apache License
/**
 * Condense the same term in multiple fields into a single term with a
 * total score.
 *
 * @param words a map of words keyed on the word(String) with Int objects as the values.
 */
private Map condenseTerms(IndexReader indexReader, Map words) throws IOException {
    HashMap termScoreMap = new HashMap();

    // For reference in score calculations, get the total # of docs in index
    int numDocs = indexReader.numDocs();

    // For each term...
    Iterator it = words.keySet().iterator();
    while (it.hasNext()) {
        Term term = (Term) it.next();

        // Filter out words that don't occur enough times in the source doc
        int tf = ((Int) words.get(term)).x;
        if (minTermFreq > 0 && tf < minTermFreq)
            continue;

        // Filter out words that don't occur in enough docs
        int docFreq = indexReader.docFreq(term);
        if (minDocFreq > 0 && docFreq < minDocFreq)
            continue;

        // Filter out words that occur in too many docs
        if (maxDocFreq > 0 && docFreq > maxDocFreq)
            continue;

        // Handle potential index update problem
        if (docFreq == 0)
            continue;

        // Calculate a score for this term.
        float idf = similarity.idf(docFreq, numDocs);
        float score = tf * idf;

        // Boost if necessary.
        Float found = (Float) boostMap.get(term.field());
        if (found != null)
            score *= found.floatValue();

        // Add the score to our map.
        String word = term.text();
        if (!termScoreMap.containsKey(word))
            termScoreMap.put(word, new Flt());
        Flt cnt = (Flt) termScoreMap.get(word);
        cnt.x += score;
    }
    return termScoreMap;
}
From source file:net.sf.jtmt.summarizers.LuceneSummarizer.java
License:Apache License
/**
 * Compute top term query.
 *
 * @param ramdir the ramdir
 * @return the query
 * @throws Exception the exception
 */
private Query computeTopTermQuery(Directory ramdir) throws Exception {
    final Map<String, Integer> frequencyMap = new HashMap<String, Integer>();
    List<String> termlist = new ArrayList<String>();
    IndexReader reader = IndexReader.open(ramdir, true);
    TermEnum terms = reader.terms();
    while (terms.next()) {
        Term term = terms.term();
        String termText = term.text();
        int frequency = reader.docFreq(term);
        frequencyMap.put(termText, frequency);
        termlist.add(termText);
    }
    reader.close();

    // sort the term map by frequency descending
    Collections.sort(termlist,
            new ReverseComparator<String>(new ByValueComparator<String, Integer>(frequencyMap)));

    // retrieve the top terms based on topTermCutoff
    List<String> topTerms = new ArrayList<String>();
    float topFreq = -1.0F;
    for (String term : termlist) {
        if (topFreq < 0.0F) {
            // first term, capture the value
            topFreq = (float) frequencyMap.get(term);
            topTerms.add(term);
        } else {
            // not the first term, compute the ratio and discard if below
            // topTermCutoff score
            float ratio = (float) ((float) frequencyMap.get(term) / topFreq);
            if (ratio >= topTermCutoff) {
                topTerms.add(term);
            } else {
                break;
            }
        }
    }

    StringBuilder termBuf = new StringBuilder();
    BooleanQuery q = new BooleanQuery();
    for (String topTerm : topTerms) {
        termBuf.append(topTerm).append("(").append(frequencyMap.get(topTerm)).append(");");
        q.add(new TermQuery(new Term("text", topTerm)), Occur.SHOULD);
    }
    System.out.println(">>> top terms: " + termBuf.toString());
    System.out.println(">>> query: " + q.toString());
    return q;
}