List of usage examples for org.apache.lucene.index.IndexReader#numDocs()
public abstract int numDocs();
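numDocs() returns the number of live (non-deleted) documents in the index, whereas maxDoc() returns one greater than the largest document number assigned and therefore also counts deleted documents. Before the examples below, here is a minimal sketch of the call, assuming a Lucene 4.x-style API (DirectoryReader.open plus FSDirectory.open taking a java.io.File); the index path is a placeholder:

import java.io.File;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class NumDocsExample {
    public static void main(String[] args) throws Exception {
        // "some-index-dir" is a placeholder for an existing Lucene index directory.
        IndexReader reader = DirectoryReader.open(FSDirectory.open(new File("some-index-dir")));
        try {
            System.out.println("live docs: " + reader.numDocs());        // excludes deleted documents
            System.out.println("maxDoc:    " + reader.maxDoc());         // largest doc number + 1
            System.out.println("deleted:   " + reader.numDeletedDocs()); // maxDoc() - numDocs()
        } finally {
            reader.close();
        }
    }
}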
From source file:org.apache.nutch.indexer.TestIndexSorter.java
License:Apache License
public void testSorting() throws Exception {
    IndexSorter sorter = new IndexSorter(conf);
    sorter.sort(testDir);
    // read back documents
    IndexReader reader = IndexReader.open(FSDirectory.open(new File(testDir, INDEX_SORTED)));
    assertEquals(reader.numDocs(), NUM_DOCS);
    for (int i = 0; i < reader.maxDoc(); i++) {
        Document doc = reader.document(i);
        Field f = doc.getField("content");
        assertNull(f);
        f = doc.getField("boost");
        float boost = Similarity.decodeNorm((byte) (NUM_DOCS - i));
        String cmp = String.valueOf(boost);
        assertEquals(cmp, f.stringValue());
    }
    reader.close();
}
From source file:org.apache.nutch.spell.NGramSpeller.java
License:Apache License
/**
 * Main driver, used to build an index. You probably want to invoke it like this:
 * <br>
 * <code>
 * java org.apache.lucene.spell.NGramSpeller -f contents -i orig_index -o ngram_index
 * </code>
 */
public static void main(String[] args) throws Throwable {
    int minThreshold = 5;
    int ng1 = 3;
    int ng2 = 4;
    int maxr = 10;
    int maxd = 5;
    String out = "gram_index";
    String gi = "gram_index";
    String name = null;
    String field = "contents";
    for (int i = 0; i < args.length; i++) {
        if (args[i].equals("-i")) {
            name = args[++i];
        } else if (args[i].equals("-minThreshold")) {
            minThreshold = Integer.parseInt(args[++i]);
        } else if (args[i].equals("-gi")) {
            gi = args[++i];
        } else if (args[i].equals("-o")) {
            out = args[++i];
        } else if (args[i].equals("-t")) { // test transpositions
            String s = args[++i];
            o.println("TRANS: " + s);
            String[] ar = formTranspositions(s);
            for (int j = 0; j < ar.length; j++)
                o.println("\t" + ar[j]);
            System.exit(0);
        } else if (args[i].equals("-ng1")) {
            ng1 = Integer.parseInt(args[++i]);
        } else if (args[i].equals("-ng2")) {
            ng2 = Integer.parseInt(args[++i]);
        } else if (args[i].equals("-help") || args[i].equals("--help") || args[i].equals("-h")) {
            o.println("To form an ngram index:");
            o.println("NGramSpeller -i ORIG_INDEX -o NGRAM_INDEX [-ng1 MIN] [-ng2 MAX] [-f FIELD]");
            o.println("Defaults are ng1=3, ng2=4, field='contents'");
            System.exit(100);
        } else if (args[i].equals("-q")) {
            String goal = args[++i];
            o.println("[NGrams] for " + goal + " from " + gi);
            float bStart = 2.0f;
            float bEnd = 1.0f;
            float bTransposition = 0f;
            o.println("bStart: " + bStart);
            o.println("bEnd: " + bEnd);
            o.println("bTrans: " + bTransposition);
            o.println("ng1: " + ng1);
            o.println("ng2: " + ng2);
            IndexReader ir = IndexReader.open(gi);
            IndexSearcher searcher = new IndexSearcher(gi);
            List lis = new ArrayList(maxr);
            String[] res = suggestUsingNGrams(searcher, goal, ng1, ng2, maxr, bStart, bEnd, bTransposition,
                    maxd, lis, true); // more popular
            o.println("Returned " + res.length + " from " + gi + " which has " + ir.numDocs() + " words in it");
            Iterator it = lis.iterator();
            while (it.hasNext()) {
                o.println(it.next().toString());
            }
            o.println();
            o.println("query: " + lastQuery.toString("contents"));
            Hits ghits = searcher.search(new TermQuery(new Term(F_WORD, "recursive")));
            if (ghits.length() >= 1) { // umm, should only be 0 or 1
                Document doc = ghits.doc(0);
                o.println("TEST DOC: " + doc);
            }
            searcher.close();
            ir.close();
            return;
        } else if (args[i].equals("-f")) {
            field = args[++i];
        } else {
            o.println("hmm? " + args[i]);
            System.exit(1);
        }
    }
    if (name == null) {
        o.println("opps, you need to specify the input index w/ -i");
        System.exit(1);
    }
    o.println("Opening " + name);
    IndexReader.unlock(FSDirectory.getDirectory(name, false));
    final IndexReader r = IndexReader.open(name);
    o.println("Docs: " + nf.format(r.numDocs()));
    o.println("Using field: " + field);
    IndexWriter writer = new IndexWriter(out, new WhitespaceAnalyzer(), true);
    writer.setMergeFactor(writer.getMergeFactor() * 50);
    writer.setMaxBufferedDocs(writer.getMaxBufferedDocs() * 50);
    o.println("Forming index from " + name + " to " + out);
    int res = formNGramIndex(r, writer, ng1, ng2, field, minThreshold);
    o.println("done, did " + res + " ngrams");
    writer.optimize();
    writer.close();
    r.close();
}
From source file:org.apache.nutch.spell.NGramSpeller.java
License:Apache License
/**
 * Go thru all terms and form an index of the "ngrams" of length 'ng1' to
 * 'ng2' in each term. The ngrams have field names like "gram3" for a 3 char
 * ngram, and "gram4" for a 4 char one. The starting and ending (or prefix and
 * suffix) "n" characters are also stored for each word with field names
 * "start3" and "end3".
 *
 * @param r
 *          the index to read terms from
 * @param w
 *          the writer to write the ngrams to, or if null an index named
 *          "gram_index" will be created. If you pass in non-null then you
 *          should optimize and close the index.
 * @param ng1
 *          the min number of chars to form ngrams with (3 is suggested)
 * @param ng2
 *          the max number of chars to form ngrams with, can be equal to ng1
 * @param fields
 *          the field name to process ngrams from.
 * @param minThreshold
 *          terms must appear in at least this many docs else they're ignored
 *          as the assumption is that they're so rare (...)
 * @return the number of ngrams added
 */
private static int formNGramIndex(IndexReader r, IndexWriter _w, int ng1, int ng2, String field,
        int minThreshold) throws IOException {
    int mins = 0;
    float nudge = 0.01f; // don't allow boosts to be too small
    IndexWriter w;
    if (_w == null) {
        w = new IndexWriter("gram_index", new WhitespaceAnalyzer(), // should have no effect
                true);
    } else {
        w = _w;
    }
    int mod = 1000; // for status
    int nd = r.numDocs();
    final float base = (float) Math.log(1.0d / ((double) nd));
    if (field == null) {
        field = "contents"; // def field
    }
    field = field.intern(); // is it doced that you can use == on fields?
    int grams = 0; // # of ngrams added
    final TermEnum te = r.terms(new Term(field, ""));
    int n = 0;
    int skips = 0;
    while (te.next()) {
        boolean show = false; // for debugging
        Term t = te.term();
        String have = t.field();
        if ((have != field) && !have.equals(field)) { // wrong field
            break;
        }
        if (t.text().indexOf('-') >= 0) {
            continue;
        }
        int df = te.docFreq();
        if ((++n % mod) == 0) {
            show = true;
            o.println("term: " + t + " n=" + nf.format(n) + " grams=" + nf.format(grams) + " mins="
                    + nf.format(mins) + " skip=" + nf.format(skips) + " docFreq=" + df);
        }
        if (df < minThreshold) { // not freq enough, too rare to consider
            mins++;
            continue;
        }
        String text = t.text();
        int len = text.length();
        if (len < ng1) {
            continue; // too short we bail but "too long" is fine...
        }
        // but note that long tokens that are rare prob won't get here anyway as
        // they won't pass the 'minThreshold' check above
        Document doc = new Document();
        doc.add(new Field(F_WORD, text, Field.Store.YES, Field.Index.UN_TOKENIZED)); // orig term
        doc.add(new Field(F_FREQ, "" + df, Field.Store.YES, Field.Index.UN_TOKENIZED)); // for popularity cutoff option
        String[] trans = formTranspositions(text);
        for (int i = 0; i < trans.length; i++)
            doc.add(new Field(F_TRANSPOSITION, trans[i], Field.Store.YES, Field.Index.UN_TOKENIZED));
        // now loop thru all ngrams of lengths 'ng1' to 'ng2'
        for (int ng = ng1; ng <= ng2; ng++) {
            String key = "gram" + ng;
            String end = null;
            for (int i = 0; i < (len - ng + 1); i++) {
                String gram = text.substring(i, i + ng);
                doc.add(new Field(key, gram, Field.Store.YES, Field.Index.UN_TOKENIZED));
                if (i == 0) {
                    doc.add(new Field("start" + ng, gram, Field.Store.YES, Field.Index.UN_TOKENIZED));
                }
                end = gram;
                grams++;
            }
            if (end != null) { // may not be present if len==ng1
                doc.add(new Field("end" + ng, end, Field.Store.YES, Field.Index.UN_TOKENIZED));
            }
        }
        float f1 = te.docFreq();
        float f2 = nd;
        float bo = (float) ((Math.log(f1) / Math.log(f2)) + nudge);
        doc.setBoost(bo);
        if (show) {
            o.println("f1=" + f1 + " nd=" + nd + " boost=" + bo + " base=" + base + " word=" + text);
        }
        w.addDocument(doc);
    }
    if (_w == null) { // else you have to optimize/close
        w.optimize();
        w.close();
    }
    return grams;
}
From source file:org.apache.solr.codecs.test.testONSQLCodec.java
License:Apache License
public static void main(String[] args) {
    try {
        plaintextDir = assureDirectoryExists(new File(INDEX_ROOT_FOLDER));
        testUtil.initPropsONSQL();
        // ----------- index documents -------
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_10_1);
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_10_1, analyzer);
        // recreate the index on each execution
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        //config.setCodec(new SimpleTextCodec());
        ONSQLCodec codec = new ONSQLCodec();
        config.setCodec(codec);
        config.setUseCompoundFile(false);
        Directory luceneDir = FSDirectory.open(plaintextDir);
        IndexWriter writer = new IndexWriter(luceneDir, config);
        writer.addDocument(Arrays.asList(new TextField("title", "The title of my first document", Store.YES),
                new TextField("content", "The content of the first document", Store.YES),
                new IntField("intval", 111111, Store.YES), new LongField("longval", 1111111111L, Store.YES)));
        writer.addDocument(Arrays.asList(new TextField("title", "The tAtle of the second document", Store.YES),
                new TextField("content", "The content of the second document", Store.YES),
                new IntField("intval", 222222, Store.YES), new LongField("longval", 222222222L, Store.YES)));
        writer.addDocument(Arrays.asList(new TextField("title", "The title of the third document", Store.YES),
                new TextField("content", "The content of the third document", Store.YES),
                new IntField("intval", 333333, Store.YES), new LongField("longval", 3333333333L, Store.YES)));
        writer.commit();
        writer.close();
        IndexReader reader = DirectoryReader.open(luceneDir);
        // now test for docs
        if (reader.numDocs() < 3)
            throw new IOException("amount of returned docs are less than indexed");
        else
            System.out.println("test passed");
        searchIndex("content", "third");
    } catch (Throwable te) {
        te.printStackTrace();
    }
}
From source file:org.apache.solr.codecs.test.testSimpleTextCodec.java
License:Apache License
public static void main(String[] args) {
    try {
        plaintextDir = assureDirectoryExists(new File(INDEX_ROOT_FOLDER, "plaintext"));
        // ----------- index documents -------
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_48);
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, analyzer);
        // recreate the index on each execution
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        config.setCodec(new SimpleTextCodec());
        config.setUseCompoundFile(false);
        Directory luceneDir = FSDirectory.open(plaintextDir);
        IndexWriter writer = new IndexWriter(luceneDir, config);
        writer.addDocument(Arrays.asList(new TextField("title", "The title of my first document", Store.YES),
                new TextField("content", "The content of the first document", Store.YES)));
        writer.addDocument(Arrays.asList(new TextField("title", "The tAtle of the second document", Store.YES),
                new TextField("content", "The content of the second document", Store.YES)));
        writer.addDocument(Arrays.asList(new TextField("title", "The title of the third document", Store.YES),
                new TextField("content", "The content of the third document", Store.YES)));
        writer.commit();
        writer.close();
        IndexReader reader = DirectoryReader.open(luceneDir);
        // now test for docs
        if (reader.numDocs() != 3)
            throw new IOException("amount of returned docs are less than indexed");
        else
            System.out.println("test passed");
        searchIndex("content", "third");
    } catch (Throwable te) {
        te.printStackTrace();
    }
}
From source file:org.apache.solr.handler.SpellCheckerRequestHandler.java
License:Apache License
/**
 * Processes the following query string parameters: q, extendedResults, cmd rebuild,
 * cmd reopen, accuracy, suggestionCount, restrictToField, and onlyMorePopular.
 */
@Override
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
    SolrParams p = req.getParams();
    String words = p.get("q");
    String cmd = p.get("cmd");
    if (cmd != null) {
        cmd = cmd.trim();
        if (cmd.equals("rebuild")) {
            rebuild(req);
            rsp.add("cmdExecuted", "rebuild");
        } else if (cmd.equals("reopen")) {
            reopen();
            rsp.add("cmdExecuted", "reopen");
        } else {
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unrecognized Command: " + cmd);
        }
    }
    // empty query string
    if (null == words || "".equals(words.trim())) {
        return;
    }
    IndexReader indexReader = null;
    String suggestionField = null;
    Float accuracy;
    int numSug;
    boolean onlyMorePopular;
    boolean extendedResults;
    try {
        accuracy = p.getFloat(ACCURACY, p.getFloat("accuracy", DEFAULT_ACCURACY));
        spellChecker.setAccuracy(accuracy);
    } catch (NumberFormatException e) {
        throw new RuntimeException("Accuracy must be a valid positive float", e);
    }
    try {
        numSug = p.getInt(SUGGESTIONS, p.getInt("suggestionCount", DEFAULT_SUGGESTION_COUNT));
    } catch (NumberFormatException e) {
        throw new RuntimeException("Spelling suggestion count must be a valid positive integer", e);
    }
    try {
        onlyMorePopular = p.getBool(POPULAR, DEFAULT_MORE_POPULAR);
    } catch (SolrException e) {
        throw new RuntimeException("'Only more popular' must be a valid boolean", e);
    }
    try {
        extendedResults = p.getBool(EXTENDED, DEFAULT_EXTENDED_RESULTS);
    } catch (SolrException e) {
        throw new RuntimeException("'Extended results' must be a valid boolean", e);
    }
    // when searching for more popular, a non null index-reader and
    // restricted-field are required
    if (onlyMorePopular || extendedResults) {
        indexReader = req.getSearcher().getReader();
        suggestionField = termSourceField;
    }
    if (extendedResults) {
        rsp.add("numDocs", indexReader.numDocs());
        SimpleOrderedMap<Object> results = new SimpleOrderedMap<Object>();
        String[] wordz = words.split(" ");
        for (String word : wordz) {
            SimpleOrderedMap<Object> nl = new SimpleOrderedMap<Object>();
            nl.add("frequency", indexReader.docFreq(new Term(suggestionField, word)));
            String[] suggestions = spellChecker.suggestSimilar(word, numSug, indexReader, suggestionField,
                    onlyMorePopular);
            // suggestion array
            NamedList<Object> sa = new NamedList<Object>();
            for (int i = 0; i < suggestions.length; i++) {
                // suggestion item
                SimpleOrderedMap<Object> si = new SimpleOrderedMap<Object>();
                si.add("frequency", indexReader.docFreq(new Term(termSourceField, suggestions[i])));
                sa.add(suggestions[i], si);
            }
            nl.add("suggestions", sa);
            results.add(word, nl);
        }
        rsp.add("result", results);
    } else {
        rsp.add("words", words);
        if (spellChecker.exist(words)) {
            rsp.add("exist", "true");
        } else {
            rsp.add("exist", "false");
        }
        String[] suggestions = spellChecker.suggestSimilar(words, numSug, indexReader, suggestionField,
                onlyMorePopular);
        rsp.add("suggestions", Arrays.asList(suggestions));
    }
}
From source file:org.archive.nutchwax.tools.DateAdder.java
License:LGPL
public int run(String[] args) throws Exception {
    if (args.length < 4) {
        System.out.println("DateAdder <key-index> <source1> ... <sourceN> <dest> <records>");
        System.exit(0);
    }
    String mainIndexDir = args[0].trim();
    String destIndexDir = args[args.length - 2].trim();
    String recordsFile = args[args.length - 1].trim();
    InputStream recordsStream;
    if ("-".equals(recordsFile)) {
        recordsStream = System.in;
    } else {
        recordsStream = new FileInputStream(recordsFile);
    }
    // Read date-addition records from stdin.
    Map<String, String> dateRecords = new HashMap<String, String>();
    BufferedReader br = new BufferedReader(new InputStreamReader(recordsStream, "UTF-8"));
    String line;
    while ((line = br.readLine()) != null) {
        String fields[] = line.split("\\s+");
        if (fields.length < 3) {
            System.out.println("Malformed line, not enough fields (" + fields.length + "): " + line);
            continue;
        }
        // Key is hash+url, value is String which is a " "-separated list of dates
        String key = fields[0] + fields[1];
        String dates = dateRecords.get(key);
        if (dates != null) {
            dates += " " + fields[2];
            dateRecords.put(key, dates);
        } else {
            dateRecords.put(key, fields[2]);
        }
    }
    IndexReader reader = IndexReader.open(mainIndexDir);
    IndexReader sourceReaders[] = new IndexReader[args.length - 3];
    for (int i = 0; i < sourceReaders.length; i++) {
        sourceReaders[i] = IndexReader.open(args[i + 1]);
    }
    IndexWriter writer = new IndexWriter(destIndexDir, new WhitespaceAnalyzer(), true);
    UrlCanonicalizer canonicalizer = getCanonicalizer(this.getConf());
    for (int i = 0; i < reader.numDocs(); i++) {
        Document oldDoc = reader.document(i);
        Document newDoc = new Document();
        // Copy the values from all the source indices to the new document.
        Set<String> uniqueDates = new HashSet<String>();
        for (IndexReader source : sourceReaders) {
            Document sourceDoc = source.document(i);
            String dates[] = sourceDoc.getValues(NutchWax.DATE_KEY);
            Collections.addAll(uniqueDates, dates);
        }
        for (String date : uniqueDates) {
            newDoc.add(new Field(NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.UN_TOKENIZED));
        }
        // Obtain the new dates for the document.
        String newDates = null;
        try {
            // First, apply URL canonicalization from Wayback
            String canonicalizedUrl = canonicalizer.urlStringToKey(oldDoc.get(NutchWax.URL_KEY));
            // Now, get the digest+URL of the document, look for it in
            // the updateRecords and if found, add the date.
            String key = canonicalizedUrl + oldDoc.get(NutchWax.DIGEST_KEY);
            newDates = dateRecords.get(key);
        } catch (Exception e) {
            // The canonicalizer can throw various types of exceptions
            // due to malformed URIs.
            System.err.println("WARN: Not adding dates on malformed URI: " + oldDoc.get(NutchWax.URL_KEY));
        }
        // If there are any new dates, add them to the new document.
        if (newDates != null) {
            for (String date : newDates.split("\\s+")) {
                newDoc.add(new Field(NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.UN_TOKENIZED));
            }
        }
        // Finally, add the new document to the new index.
        writer.addDocument(newDoc);
    }
    reader.close();
    writer.close();
    return 0;
}
From source file:org.archive.nutchwax.tools.DumpParallelIndex.java
License:LGPL
private static void dumpIndex(IndexReader reader, String fieldName) throws Exception {
    Collection fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);
    if (!fieldNames.contains(fieldName)) {
        System.out.println("Field not in index: " + fieldName);
        System.exit(2);
    }
    int numDocs = reader.numDocs();
    for (int i = 0; i < numDocs; i++) {
        System.out.println(Arrays.toString(reader.document(i).getValues((String) fieldName)));
    }
}
From source file:org.archive.nutchwax.tools.DumpParallelIndex.java
License:LGPL
private static void dumpIndex(IndexReader reader) throws Exception {
    Object[] fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL).toArray();
    Arrays.sort(fieldNames);
    for (int i = 0; i < fieldNames.length; i++) {
        System.out.print(fieldNames[i] + "\t");
    }
    System.out.println();
    int numDocs = reader.numDocs();
    for (int i = 0; i < numDocs; i++) {
        for (int j = 0; j < fieldNames.length; j++) {
            System.out.print(Arrays.toString(reader.document(i).getValues((String) fieldNames[j])) + "\t");
        }
        System.out.println();
    }
}
From source file:org.archive.nutchwax.tools.DumpParallelIndex.java
License:LGPL
private static void countDocs(IndexReader reader) throws Exception {
    System.out.println(reader.numDocs());
}