List of usage examples for org.apache.lucene.index.IndexReader#numDocs()
public abstract int numDocs();
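numDocs() returns the number of live (non-deleted) documents in the index, whereas maxDoc() returns one greater than the largest document number assigned and therefore also counts deleted documents. Before the examples below, here is a minimal sketch of the call, assuming a Lucene 4.x-style API (DirectoryReader.open plus FSDirectory.open taking a java.io.File); the index path is a placeholder:

import java.io.File;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class NumDocsExample {
    public static void main(String[] args) throws Exception {
        // "some-index-dir" is a placeholder for an existing Lucene index directory.
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("some-index-dir")));
        try {
            System.out.println("live docs: " + reader.numDocs());        // excludes deleted documents
            System.out.println("maxDoc:    " + reader.maxDoc());         // largest doc number + 1
            System.out.println("deleted:   " + reader.numDeletedDocs()); // maxDoc() - numDocs()
        } finally {
            reader.close();
        }
    }
}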
From source file:org.apache.nutch.indexer.TestIndexSorter.java
License:Apache License
public void testSorting() throws Exception {
    IndexSorter sorter = new IndexSorter(conf);
    sorter.sort(testDir);
    // read back documents
    IndexReader reader = IndexReader.open(FSDirectory.open(new File(testDir, INDEX_SORTED)));
    assertEquals(reader.numDocs(), NUM_DOCS);
    for (int i = 0; i < reader.maxDoc(); i++) {
        Document doc = reader.document(i);
        Field f = doc.getField("content");
        assertNull(f);
        f = doc.getField("boost");
        float boost = Similarity.decodeNorm((byte) (NUM_DOCS - i));
        String cmp = String.valueOf(boost);
        assertEquals(cmp, f.stringValue());
    }
    reader.close();
}
From source file:org.apache.nutch.spell.NGramSpeller.java
License:Apache License
/**
 * Main driver, used to build an index. You probably want to invoke it like this:
 * <br>
 * <code>
 * java org.apache.lucene.spell.NGramSpeller -f contents -i orig_index -o ngram_index
 * </code>
 */
public static void main(String[] args) throws Throwable {
    int minThreshold = 5;
    int ng1 = 3;
    int ng2 = 4;
    int maxr = 10;
    int maxd = 5;
    String out = "gram_index";
    String gi = "gram_index";
    String name = null;
    String field = "contents";
    for (int i = 0; i < args.length; i++) {
        if (args[i].equals("-i")) {
            name = args[++i];
        } else if (args[i].equals("-minThreshold")) {
            minThreshold = Integer.parseInt(args[++i]);
        } else if (args[i].equals("-gi")) {
            gi = args[++i];
        } else if (args[i].equals("-o")) {
            out = args[++i];
        } else if (args[i].equals("-t")) { // test transpositions
            String s = args[++i];
            o.println("TRANS: " + s);
            String[] ar = formTranspositions(s);
            for (int j = 0; j < ar.length; j++)
                o.println("\t" + ar[j]);
            System.exit(0);
        } else if (args[i].equals("-ng1")) {
            ng1 = Integer.parseInt(args[++i]);
        } else if (args[i].equals("-ng2")) {
            ng2 = Integer.parseInt(args[++i]);
        } else if (args[i].equals("-help") || args[i].equals("--help") || args[i].equals("-h")) {
            o.println("To form an ngram index:");
            o.println("NGramSpeller -i ORIG_INDEX -o NGRAM_INDEX [-ng1 MIN] [-ng2 MAX] [-f FIELD]");
            o.println("Defaults are ng1=3, ng2=4, field='contents'");
            System.exit(100);
        } else if (args[i].equals("-q")) {
            String goal = args[++i];
            o.println("[NGrams] for " + goal + " from " + gi);
            float bStart = 2.0f;
            float bEnd = 1.0f;
            float bTransposition = 0f;
            o.println("bStart: " + bStart);
            o.println("bEnd: " + bEnd);
            o.println("bTrans: " + bTransposition);
            o.println("ng1: " + ng1);
            o.println("ng2: " + ng2);
            IndexReader ir = IndexReader.open(gi);
            IndexSearcher searcher = new IndexSearcher(gi);
            List lis = new ArrayList(maxr);
            String[] res = suggestUsingNGrams(searcher, goal, ng1, ng2, maxr, bStart, bEnd, bTransposition,
                    maxd, lis, true); // more popular
            o.println("Returned " + res.length + " from " + gi + " which has " + ir.numDocs() + " words in it");
            Iterator it = lis.iterator();
            while (it.hasNext()) {
                o.println(it.next().toString());
            }
            o.println();
            o.println("query: " + lastQuery.toString("contents"));
            Hits ghits = searcher.search(new TermQuery(new Term(F_WORD, "recursive")));
            if (ghits.length() >= 1) { // umm, should only be 0 or 1
                Document doc = ghits.doc(0);
                o.println("TEST DOC: " + doc);
            }
            searcher.close();
            ir.close();
            return;
        } else if (args[i].equals("-f")) {
            field = args[++i];
        } else {
            o.println("hmm? " + args[i]);
            System.exit(1);
        }
    }
    if (name == null) {
        o.println("opps, you need to specify the input index w/ -i");
        System.exit(1);
    }
    o.println("Opening " + name);
    IndexReader.unlock(FSDirectory.getDirectory(name, false));
    final IndexReader r = IndexReader.open(name);
    o.println("Docs: " + nf.format(r.numDocs()));
    o.println("Using field: " + field);
    IndexWriter writer = new IndexWriter(out, new WhitespaceAnalyzer(), true);
    writer.setMergeFactor(writer.getMergeFactor() * 50);
    writer.setMaxBufferedDocs(writer.getMaxBufferedDocs() * 50);
    o.println("Forming index from " + name + " to " + out);
    int res = formNGramIndex(r, writer, ng1, ng2, field, minThreshold);
    o.println("done, did " + res + " ngrams");
    writer.optimize();
    writer.close();
    r.close();
}
From source file:org.apache.nutch.spell.NGramSpeller.java
License:Apache License
/**
 * Go thru all terms and form an index of the "ngrams" of length 'ng1' to
 * 'ng2' in each term. The ngrams have field names like "gram3" for a 3 char
 * ngram, and "gram4" for a 4 char one. The starting and ending (or prefix and
 * suffix) "n" characters are also stored for each word with field names
 * "start3" and "end3".
 *
 * @param r
 *          the index to read terms from
 * @param w
 *          the writer to write the ngrams to, or if null an index named
 *          "gram_index" will be created. If you pass in non-null then you
 *          should optimize and close the index.
 * @param ng1
 *          the min number of chars to form ngrams with (3 is suggested)
 * @param ng2
 *          the max number of chars to form ngrams with, can be equal to ng1
 * @param fields
 *          the field name to process ngrams from.
 * @param minThreshold
 *          terms must appear in at least this many docs else they're ignored
 *          as the assumption is that they're so rare (...)
 * @return the number of ngrams added
 */
private static int formNGramIndex(IndexReader r, IndexWriter _w, int ng1, int ng2, String field,
        int minThreshold) throws IOException {
    int mins = 0;
    float nudge = 0.01f; // don't allow boosts to be too small
    IndexWriter w;
    if (_w == null) {
        w = new IndexWriter("gram_index", new WhitespaceAnalyzer(), // should have no effect
                true);
    } else {
        w = _w;
    }
    int mod = 1000; // for status
    int nd = r.numDocs();
    final float base = (float) Math.log(1.0d / ((double) nd));
    if (field == null) {
        field = "contents"; // def field
    }
    field = field.intern(); // is it doced that you can use == on fields?
    int grams = 0; // # of ngrams added
    final TermEnum te = r.terms(new Term(field, ""));
    int n = 0;
    int skips = 0;
    while (te.next()) {
        boolean show = false; // for debugging
        Term t = te.term();
        String have = t.field();
        if ((have != field) && !have.equals(field)) { // wrong field
            break;
        }
        if (t.text().indexOf('-') >= 0) {
            continue;
        }
        int df = te.docFreq();
        if ((++n % mod) == 0) {
            show = true;
            o.println("term: " + t + " n=" + nf.format(n) + " grams=" + nf.format(grams) + " mins="
                    + nf.format(mins) + " skip=" + nf.format(skips) + " docFreq=" + df);
        }
        if (df < minThreshold) { // not freq enough, too rare to consider
            mins++;
            continue;
        }
        String text = t.text();
        int len = text.length();
        if (len < ng1) {
            continue; // too short we bail but "too long" is fine...
        }
        // but note that long tokens that are rare prob won't get here anyway as
        // they won't pass the 'minThreshold' check above
        Document doc = new Document();
        doc.add(new Field(F_WORD, text, Field.Store.YES, Field.Index.UN_TOKENIZED)); // orig term
        doc.add(new Field(F_FREQ, "" + df, Field.Store.YES, Field.Index.UN_TOKENIZED)); // for popularity cutoff option
        String[] trans = formTranspositions(text);
        for (int i = 0; i < trans.length; i++)
            doc.add(new Field(F_TRANSPOSITION, trans[i], Field.Store.YES, Field.Index.UN_TOKENIZED));
        // now loop thru all ngrams of lengths 'ng1' to 'ng2'
        for (int ng = ng1; ng <= ng2; ng++) {
            String key = "gram" + ng;
            String end = null;
            for (int i = 0; i < (len - ng + 1); i++) {
                String gram = text.substring(i, i + ng);
                doc.add(new Field(key, gram, Field.Store.YES, Field.Index.UN_TOKENIZED));
                if (i == 0) {
                    doc.add(new Field("start" + ng, gram, Field.Store.YES, Field.Index.UN_TOKENIZED));
                }
                end = gram;
                grams++;
            }
            if (end != null) { // may not be present if len==ng1
                doc.add(new Field("end" + ng, end, Field.Store.YES, Field.Index.UN_TOKENIZED));
            }
        }
        float f1 = te.docFreq();
        float f2 = nd;
        float bo = (float) ((Math.log(f1) / Math.log(f2)) + nudge);
        doc.setBoost(bo);
        if (show) {
            o.println("f1=" + f1 + " nd=" + nd + " boost=" + bo + " base=" + base + " word=" + text);
        }
        w.addDocument(doc);
    }
    if (_w == null) { // else you have to optimize/close
        w.optimize();
        w.close();
    }
    return grams;
}
From source file:org.apache.solr.codecs.test.testONSQLCodec.java
License:Apache License
public static void main(String[] args) {
    try {
        plaintextDir = assureDirectoryExists(new File(INDEX_ROOT_FOLDER));
        testUtil.initPropsONSQL();
        // ----------- index documents -------
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_10_1);
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_10_1, analyzer);
        // recreate the index on each execution
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        //config.setCodec(new SimpleTextCodec());
        ONSQLCodec codec = new ONSQLCodec();
        config.setCodec(codec);
        config.setUseCompoundFile(false);
        Directory luceneDir = FSDirectory.open(plaintextDir);
        IndexWriter writer = new IndexWriter(luceneDir, config);
        writer.addDocument(Arrays.asList(new TextField("title", "The title of my first document", Store.YES),
                new TextField("content", "The content of the first document", Store.YES),
                new IntField("intval", 111111, Store.YES), new LongField("longval", 1111111111L, Store.YES)));
        writer.addDocument(Arrays.asList(new TextField("title", "The tAtle of the second document", Store.YES),
                new TextField("content", "The content of the second document", Store.YES),
                new IntField("intval", 222222, Store.YES), new LongField("longval", 222222222L, Store.YES)));
        writer.addDocument(Arrays.asList(new TextField("title", "The title of the third document", Store.YES),
                new TextField("content", "The content of the third document", Store.YES),
                new IntField("intval", 333333, Store.YES), new LongField("longval", 3333333333L, Store.YES)));
        writer.commit();
        writer.close();
        IndexReader reader = DirectoryReader.open(luceneDir);
        // now test for docs
        if (reader.numDocs() < 3)
            throw new IOException("amount of returned docs are less than indexed");
        else
            System.out.println("test passed");
        searchIndex("content", "third");
    } catch (Throwable te) {
        te.printStackTrace();
    }
}
From source file:org.apache.solr.codecs.test.testSimpleTextCodec.java
License:Apache License
public static void main(String[] args) {
    try {
        plaintextDir = assureDirectoryExists(new File(INDEX_ROOT_FOLDER, "plaintext"));
        // ----------- index documents -------
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_48);
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, analyzer);
        // recreate the index on each execution
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        config.setCodec(new SimpleTextCodec());
        config.setUseCompoundFile(false);
        Directory luceneDir = FSDirectory.open(plaintextDir);
        IndexWriter writer = new IndexWriter(luceneDir, config);
        writer.addDocument(Arrays.asList(new TextField("title", "The title of my first document", Store.YES),
                new TextField("content", "The content of the first document", Store.YES)));
        writer.addDocument(Arrays.asList(new TextField("title", "The tAtle of the second document", Store.YES),
                new TextField("content", "The content of the second document", Store.YES)));
        writer.addDocument(Arrays.asList(new TextField("title", "The title of the third document", Store.YES),
                new TextField("content", "The content of the third document", Store.YES)));
        writer.commit();
        writer.close();
        IndexReader reader = DirectoryReader.open(luceneDir);
        // now test for docs
        if (reader.numDocs() != 3)
            throw new IOException("amount of returned docs are less than indexed");
        else
            System.out.println("test passed");
        searchIndex("content", "third");
    } catch (Throwable te) {
        te.printStackTrace();
    }
}
From source file:org.apache.solr.handler.SpellCheckerRequestHandler.java
License:Apache License
/**
 * Processes the following query string parameters: q, extendedResults, cmd rebuild,
 * cmd reopen, accuracy, suggestionCount, restrictToField, and onlyMorePopular.
 */
@Override
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
    SolrParams p = req.getParams();
    String words = p.get("q");
    String cmd = p.get("cmd");
    if (cmd != null) {
        cmd = cmd.trim();
        if (cmd.equals("rebuild")) {
            rebuild(req);
            rsp.add("cmdExecuted", "rebuild");
        } else if (cmd.equals("reopen")) {
            reopen();
            rsp.add("cmdExecuted", "reopen");
        } else {
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unrecognized Command: " + cmd);
        }
    }
    // empty query string
    if (null == words || "".equals(words.trim())) {
        return;
    }
    IndexReader indexReader = null;
    String suggestionField = null;
    Float accuracy;
    int numSug;
    boolean onlyMorePopular;
    boolean extendedResults;
    try {
        accuracy = p.getFloat(ACCURACY, p.getFloat("accuracy", DEFAULT_ACCURACY));
        spellChecker.setAccuracy(accuracy);
    } catch (NumberFormatException e) {
        throw new RuntimeException("Accuracy must be a valid positive float", e);
    }
    try {
        numSug = p.getInt(SUGGESTIONS, p.getInt("suggestionCount", DEFAULT_SUGGESTION_COUNT));
    } catch (NumberFormatException e) {
        throw new RuntimeException("Spelling suggestion count must be a valid positive integer", e);
    }
    try {
        onlyMorePopular = p.getBool(POPULAR, DEFAULT_MORE_POPULAR);
    } catch (SolrException e) {
        throw new RuntimeException("'Only more popular' must be a valid boolean", e);
    }
    try {
        extendedResults = p.getBool(EXTENDED, DEFAULT_EXTENDED_RESULTS);
    } catch (SolrException e) {
        throw new RuntimeException("'Extended results' must be a valid boolean", e);
    }
    // when searching for more popular, a non null index-reader and
    // restricted-field are required
    if (onlyMorePopular || extendedResults) {
        indexReader = req.getSearcher().getReader();
        suggestionField = termSourceField;
    }
    if (extendedResults) {
        rsp.add("numDocs", indexReader.numDocs());
        SimpleOrderedMap<Object> results = new SimpleOrderedMap<Object>();
        String[] wordz = words.split(" ");
        for (String word : wordz) {
            SimpleOrderedMap<Object> nl = new SimpleOrderedMap<Object>();
            nl.add("frequency", indexReader.docFreq(new Term(suggestionField, word)));
            String[] suggestions = spellChecker.suggestSimilar(word, numSug, indexReader, suggestionField,
                    onlyMorePopular);
            // suggestion array
            NamedList<Object> sa = new NamedList<Object>();
            for (int i = 0; i < suggestions.length; i++) {
                // suggestion item
                SimpleOrderedMap<Object> si = new SimpleOrderedMap<Object>();
                si.add("frequency", indexReader.docFreq(new Term(termSourceField, suggestions[i])));
                sa.add(suggestions[i], si);
            }
            nl.add("suggestions", sa);
            results.add(word, nl);
        }
        rsp.add("result", results);
    } else {
        rsp.add("words", words);
        if (spellChecker.exist(words)) {
            rsp.add("exist", "true");
        } else {
            rsp.add("exist", "false");
        }
        String[] suggestions = spellChecker.suggestSimilar(words, numSug, indexReader, suggestionField,
                onlyMorePopular);
        rsp.add("suggestions", Arrays.asList(suggestions));
    }
}
From source file:org.archive.nutchwax.tools.DateAdder.java
License:LGPL
public int run(String[] args) throws Exception {
    if (args.length < 4) {
        System.out.println("DateAdder <key-index> <source1> ... <sourceN> <dest> <records>");
        System.exit(0);
    }
    String mainIndexDir = args[0].trim();
    String destIndexDir = args[args.length - 2].trim();
    String recordsFile = args[args.length - 1].trim();
    InputStream recordsStream;
    if ("-".equals(recordsFile)) {
        recordsStream = System.in;
    } else {
        recordsStream = new FileInputStream(recordsFile);
    }
    // Read date-addition records from stdin.
    Map<String, String> dateRecords = new HashMap<String, String>();
    BufferedReader br = new BufferedReader(new InputStreamReader(recordsStream, "UTF-8"));
    String line;
    while ((line = br.readLine()) != null) {
        String fields[] = line.split("\\s+");
        if (fields.length < 3) {
            System.out.println("Malformed line, not enough fields (" + fields.length + "): " + line);
            continue;
        }
        // Key is hash+url, value is String which is a " "-separated list of dates
        String key = fields[0] + fields[1];
        String dates = dateRecords.get(key);
        if (dates != null) {
            dates += " " + fields[2];
            dateRecords.put(key, dates);
        } else {
            dateRecords.put(key, fields[2]);
        }
    }
    IndexReader reader = IndexReader.open(mainIndexDir);
    IndexReader sourceReaders[] = new IndexReader[args.length - 3];
    for (int i = 0; i < sourceReaders.length; i++) {
        sourceReaders[i] = IndexReader.open(args[i + 1]);
    }
    IndexWriter writer = new IndexWriter(destIndexDir, new WhitespaceAnalyzer(), true);
    UrlCanonicalizer canonicalizer = getCanonicalizer(this.getConf());
    for (int i = 0; i < reader.numDocs(); i++) {
        Document oldDoc = reader.document(i);
        Document newDoc = new Document();
        // Copy the values from all the source indices to the new document.
        Set<String> uniqueDates = new HashSet<String>();
        for (IndexReader source : sourceReaders) {
            Document sourceDoc = source.document(i);
            String dates[] = sourceDoc.getValues(NutchWax.DATE_KEY);
            Collections.addAll(uniqueDates, dates);
        }
        for (String date : uniqueDates) {
            newDoc.add(new Field(NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.UN_TOKENIZED));
        }
        // Obtain the new dates for the document.
        String newDates = null;
        try {
            // First, apply URL canonicalization from Wayback
            String canonicalizedUrl = canonicalizer.urlStringToKey(oldDoc.get(NutchWax.URL_KEY));
            // Now, get the digest+URL of the document, look for it in
            // the updateRecords and if found, add the date.
            String key = canonicalizedUrl + oldDoc.get(NutchWax.DIGEST_KEY);
            newDates = dateRecords.get(key);
        } catch (Exception e) {
            // The canonicalizer can throw various types of exceptions
            // due to malformed URIs.
            System.err.println("WARN: Not adding dates on malformed URI: " + oldDoc.get(NutchWax.URL_KEY));
        }
        // If there are any new dates, add them to the new document.
        if (newDates != null) {
            for (String date : newDates.split("\\s+")) {
                newDoc.add(new Field(NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.UN_TOKENIZED));
            }
        }
        // Finally, add the new document to the new index.
        writer.addDocument(newDoc);
    }
    reader.close();
    writer.close();
    return 0;
}
From source file:org.archive.nutchwax.tools.DumpParallelIndex.java
License:LGPL
private static void dumpIndex(IndexReader reader, String fieldName) throws Exception {
    Collection fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);
    if (!fieldNames.contains(fieldName)) {
        System.out.println("Field not in index: " + fieldName);
        System.exit(2);
    }
    int numDocs = reader.numDocs();
    for (int i = 0; i < numDocs; i++) {
        System.out.println(Arrays.toString(reader.document(i).getValues((String) fieldName)));
    }
}
From source file:org.archive.nutchwax.tools.DumpParallelIndex.java
License:LGPL
private static void dumpIndex(IndexReader reader) throws Exception {
    Object[] fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL).toArray();
    Arrays.sort(fieldNames);
    for (int i = 0; i < fieldNames.length; i++) {
        System.out.print(fieldNames[i] + "\t");
    }
    System.out.println();
    int numDocs = reader.numDocs();
    for (int i = 0; i < numDocs; i++) {
        for (int j = 0; j < fieldNames.length; j++) {
            System.out.print(Arrays.toString(reader.document(i).getValues((String) fieldNames[j])) + "\t");
        }
        System.out.println();
    }
}
From source file:org.archive.nutchwax.tools.DumpParallelIndex.java
License:LGPL
private static void countDocs(IndexReader reader) throws Exception {
    System.out.println(reader.numDocs());
}