Example usage for org.apache.lucene.index IndexReader numDocs

Introduction

This page collects example usages of org.apache.lucene.index.IndexReader#numDocs, drawn from open-source projects.

Prototype

public abstract int numDocs();

Document

Returns the number of documents in this index.
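Before the project examples, here is a minimal self-contained sketch (written against the Lucene 4.x API that several examples below use; the index path is hypothetical) showing numDocs() next to the closely related maxDoc():

import java.io.File;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class NumDocsDemo {
    public static void main(String[] args) throws Exception {
        // Open an existing index directory (hypothetical path).
        Directory dir = FSDirectory.open(new File("/tmp/my-index"));
        IndexReader reader = DirectoryReader.open(dir);
        try {
            // numDocs() counts live (non-deleted) documents, while maxDoc()
            // is one greater than the largest document ID and still counts
            // deleted documents that have not been merged away.
            System.out.println("numDocs: " + reader.numDocs());
            System.out.println("maxDoc:  " + reader.maxDoc());
        } finally {
            reader.close();
            dir.close();
        }
    }
}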

Usage

From source file: org.apache.nutch.indexer.TestIndexSorter.java

License: Apache License

public void testSorting() throws Exception {
    IndexSorter sorter = new IndexSorter(conf);
    sorter.sort(testDir);

    // read back documents
    IndexReader reader = IndexReader.open(FSDirectory.open(new File(testDir, INDEX_SORTED)));
    assertEquals(NUM_DOCS, reader.numDocs());
    for (int i = 0; i < reader.maxDoc(); i++) {
        Document doc = reader.document(i);
        Field f = doc.getField("content");
        assertNull(f);
        f = doc.getField("boost");
        float boost = Similarity.decodeNorm((byte) (NUM_DOCS - i));
        String cmp = String.valueOf(boost);
        assertEquals(cmp, f.stringValue());
    }
    reader.close();
}
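Note that the test asserts on reader.numDocs() but bounds its loop with reader.maxDoc(): the two agree in a freshly built index with no deletions, but maxDoc() is the right bound for iterating document IDs, since numDocs() excludes deleted documents.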

From source file: org.apache.nutch.spell.NGramSpeller.java

License: Apache License

/**
 * Main driver, used to build an index. You probably want to invoke it like this:
 * <br>
 * <code>
 * java org.apache.lucene.spell.NGramSpeller -f contents -i orig_index -o ngram_index
 * </code>
 */
public static void main(String[] args) throws Throwable {
    int minThreshold = 5;
    int ng1 = 3;
    int ng2 = 4;
    int maxr = 10;
    int maxd = 5;
    String out = "gram_index";
    String gi = "gram_index";

    String name = null;
    String field = "contents";

    for (int i = 0; i < args.length; i++) {
        if (args[i].equals("-i")) {
            name = args[++i];
        } else if (args[i].equals("-minThreshold")) {
            minThreshold = Integer.parseInt(args[++i]);
        } else if (args[i].equals("-gi")) {
            gi = args[++i];
        } else if (args[i].equals("-o")) {
            out = args[++i];
        } else if (args[i].equals("-t")) { // test transpositions

            String s = args[++i];
            o.println("TRANS: " + s);

            String[] ar = formTranspositions(s);

            for (int j = 0; j < ar.length; j++)
                o.println("\t" + ar[j]);

            System.exit(0);
        } else if (args[i].equals("-ng1")) {
            ng1 = Integer.parseInt(args[++i]);
        } else if (args[i].equals("-ng2")) {
            ng2 = Integer.parseInt(args[++i]);
        } else if (args[i].equals("-help") || args[i].equals("--help") || args[i].equals("-h")) {
            o.println("To form an ngram index:");
            o.println("NGramSpeller -i ORIG_INDEX -o NGRAM_INDEX [-ng1 MIN] [-ng2 MAX] [-f FIELD]");
            o.println("Defaults are ng1=3, ng2=4, field='contents'");
            System.exit(100);
        } else if (args[i].equals("-q")) {
            String goal = args[++i];
            o.println("[NGrams] for " + goal + " from " + gi);

            float bStart = 2.0f;
            float bEnd = 1.0f;
            float bTransposition = 0f;

            o.println("bStart: " + bStart);
            o.println("bEnd: " + bEnd);
            o.println("bTrans: " + bTransposition);
            o.println("ng1: " + ng1);
            o.println("ng2: " + ng2);

            IndexReader ir = IndexReader.open(gi);
            IndexSearcher searcher = new IndexSearcher(gi);
            List lis = new ArrayList(maxr);
            String[] res = suggestUsingNGrams(searcher, goal, ng1, ng2, maxr, bStart, bEnd, bTransposition,
                    maxd, lis, true); // more popular
            o.println("Returned " + res.length + " from " + gi + " which has " + ir.numDocs() + " words in it");

            Iterator it = lis.iterator();

            while (it.hasNext()) {
                o.println(it.next().toString());
            }

            o.println();
            o.println("query: " + lastQuery.toString("contents"));

            Hits ghits = searcher.search(new TermQuery(new Term(F_WORD, "recursive")));

            if (ghits.length() >= 1) // umm, should only be 0 or 1
            {
                Document doc = ghits.doc(0);
                o.println("TEST DOC: " + doc);
            }

            searcher.close();
            ir.close();

            return;
        } else if (args[i].equals("-f")) {
            field = args[++i];
        } else {
            o.println("hmm? " + args[i]);
            System.exit(1);
        }
    }

    if (name == null) {
        o.println("opps, you need to specify the input index w/ -i");
        System.exit(1);
    }

    o.println("Opening " + name);
    IndexReader.unlock(FSDirectory.getDirectory(name, false));

    final IndexReader r = IndexReader.open(name);

    o.println("Docs: " + nf.format(r.numDocs()));
    o.println("Using field: " + field);

    IndexWriter writer = new IndexWriter(out, new WhitespaceAnalyzer(), true);
    writer.setMergeFactor(writer.getMergeFactor() * 50);
    writer.setMaxBufferedDocs(writer.getMaxBufferedDocs() * 50);

    o.println("Forming index from " + name + " to " + out);

    int res = formNGramIndex(r, writer, ng1, ng2, field, minThreshold);

    o.println("done, did " + res + " ngrams");
    writer.optimize();
    writer.close();
    r.close();
}

From source file: org.apache.nutch.spell.NGramSpeller.java

License: Apache License

/**
 * Go thru all terms and form an index of the "ngrams" of length 'ng1' to
 * 'ng2' in each term. The ngrams have field names like "gram3" for a 3 char
 * ngram, and "gram4" for a 4 char one. The starting and ending (or prefix and
 * suffix) "n" characters are also stored for each word with field names
 * "start3" and "end3"./*from w  w w.j ava 2  s  . co  m*/
 * 
 * 
 * @param r
 *          the index to read terms from
 * 
 * @param w
 *          the writer to write the ngrams to, or if null an index named
 *          "gram_index" will be created. If you pass in non-null then you
 *          should optimize and close the index.
 * 
 * @param ng1
 *          the min number of chars to form ngrams with (3 is suggested)
 * 
 * @param ng2
 *          the max number of chars to form ngrams with, can be equal to ng1
 * 
 * @param fields
 *          the field name to process ngrams from.
 * 
 * @param minThreshold
 *          terms must appear in at least this many docs else they're ignored
 *          as the assumption is that they're so rare (...)
 * 
 * @return the number of ngrams added
 * 
 */
private static int formNGramIndex(IndexReader r, IndexWriter _w, int ng1, int ng2, String field,
        int minThreshold) throws IOException {
    int mins = 0;
    float nudge = 0.01f; // don't allow boosts to be too small
    IndexWriter w;

    if (_w == null) {
        // the WhitespaceAnalyzer should have no effect here, since the
        // ngram fields are indexed untokenized
        w = new IndexWriter("gram_index", new WhitespaceAnalyzer(), true);
    } else {
        w = _w;
    }

    int mod = 1000; // for status
    int nd = r.numDocs();
    final float base = (float) Math.log(1.0d / ((double) nd));

    if (field == null) {
        field = "contents"; // def field
    }

    field = field.intern(); // is it doced that you can use == on fields?

    int grams = 0; // # of ngrams added
    final TermEnum te = r.terms(new Term(field, ""));
    int n = 0;
    int skips = 0;

    while (te.next()) {
        boolean show = false; // for debugging
        Term t = te.term();
        String have = t.field();

        if ((have != field) && !have.equals(field)) // wrong field
        {
            break;
        }

        if (t.text().indexOf('-') >= 0) {
            continue;
        }

        int df = te.docFreq();

        if ((++n % mod) == 0) {
            show = true;
            o.println("term: " + t + " n=" + nf.format(n) + " grams=" + nf.format(grams) + " mins="
                    + nf.format(mins) + " skip=" + nf.format(skips) + " docFreq=" + df);
        }

        if (df < minThreshold) // not freq enough, too rare to consider
        {
            mins++;

            continue;
        }

        String text = t.text();
        int len = text.length();

        if (len < ng1) {
            continue; // too short we bail but "too long" is fine...
        }

        // Note that long tokens that are rare probably won't get here anyway,
        // as they won't pass the 'minThreshold' check above.
        Document doc = new Document();
        doc.add(new Field(F_WORD, text, Field.Store.YES, Field.Index.UN_TOKENIZED)); // orig term
        doc.add(new Field(F_FREQ, "" + df, Field.Store.YES, Field.Index.UN_TOKENIZED)); // for popularity cutoff option

        String[] trans = formTranspositions(text);

        for (int i = 0; i < trans.length; i++)
            doc.add(new Field(F_TRANSPOSITION, trans[i], Field.Store.YES, Field.Index.UN_TOKENIZED));

        // now loop thru all ngrams of lengths 'ng1' to 'ng2'
        for (int ng = ng1; ng <= ng2; ng++) {
            String key = "gram" + ng;
            String end = null;

            for (int i = 0; i < (len - ng + 1); i++) {
                String gram = text.substring(i, i + ng);
                doc.add(new Field(key, gram, Field.Store.YES, Field.Index.UN_TOKENIZED));

                if (i == 0) {
                    doc.add(new Field("start" + ng, gram, Field.Store.YES, Field.Index.UN_TOKENIZED));
                }

                end = gram;
                grams++;
            }

            if (end != null) { // may not be present if len==ng1
                doc.add(new Field("end" + ng, end, Field.Store.YES, Field.Index.UN_TOKENIZED));
            }
        }

        float f1 = te.docFreq();
        float f2 = nd;

        float bo = (float) ((Math.log(f1) / Math.log(f2)) + nudge);
        doc.setBoost(bo);

        if (show) {
            o.println("f1=" + f1 + " nd=" + nd + " boost=" + bo + " base=" + base + " word=" + text);
        }

        w.addDocument(doc);
    }

    if (_w == null) // else you have to optimize/close
    {
        w.optimize();
        w.close();
    }

    return grams;
}
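A note on the per-document boost computed near the end of this method: log(df) / log(numDocs) grows with a word's document frequency and approaches 1.0 as df approaches the total reported by numDocs(), so more common words are boosted above rarer ones; the small "nudge" keeps the boost for words that barely clear minThreshold from becoming vanishingly small.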

From source file: org.apache.solr.codecs.test.testONSQLCodec.java

License: Apache License

public static void main(String[] args) {
    try {
        plaintextDir = assureDirectoryExists(new File(INDEX_ROOT_FOLDER));
        testUtil.initPropsONSQL();
        //----------- index documents -------
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_10_1);
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_10_1, analyzer);
        // recreate the index on each execution
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        //config.setCodec(new SimpleTextCodec());            
        ONSQLCodec codec = new ONSQLCodec();
        config.setCodec(codec);
        config.setUseCompoundFile(false);
        Directory luceneDir = FSDirectory.open(plaintextDir);
        IndexWriter writer = new IndexWriter(luceneDir, config);
        writer.addDocument(Arrays.asList(new TextField("title", "The title of my first document", Store.YES),
                new TextField("content", "The content of the first document", Store.YES),
                new IntField("intval", 111111, Store.YES), new LongField("longval", 1111111111L, Store.YES)));

        writer.addDocument(Arrays.asList(new TextField("title", "The tAtle of the second document", Store.YES),
                new TextField("content", "The content of the second document", Store.YES),
                new IntField("intval", 222222, Store.YES), new LongField("longval", 222222222L, Store.YES)));
        writer.addDocument(Arrays.asList(new TextField("title", "The title of the third document", Store.YES),
                new TextField("content", "The content of the third document", Store.YES),
                new IntField("intval", 333333, Store.YES), new LongField("longval", 3333333333L, Store.YES)));
        writer.commit();
        writer.close();
        IndexReader reader = DirectoryReader.open(luceneDir);
        // now test for docs
        if (reader.numDocs() < 3)
            throw new IOException("amount of returned docs are less than indexed");
        else
            System.out.println("test passed");
        searchIndex("content", "third");
    } catch (Throwable te) {
        te.printStackTrace();
    }
}

From source file: org.apache.solr.codecs.test.testSimpleTextCodec.java

License: Apache License

public static void main(String[] args) {
    try {
        plaintextDir = assureDirectoryExists(new File(INDEX_ROOT_FOLDER, "plaintext"));

        //----------- index documents -------
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_48);
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_48, analyzer);
        // recreate the index on each execution
        config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        config.setCodec(new SimpleTextCodec());
        config.setUseCompoundFile(false);
        Directory luceneDir = FSDirectory.open(plaintextDir);
        IndexWriter writer = new IndexWriter(luceneDir, config);
        writer.addDocument(Arrays.asList(new TextField("title", "The title of my first document", Store.YES),
                new TextField("content", "The content of the first document", Store.YES)));

        writer.addDocument(Arrays.asList(new TextField("title", "The tAtle of the second document", Store.YES),
                new TextField("content", "The content of the second document", Store.YES)));
        writer.addDocument(Arrays.asList(new TextField("title", "The title of the third document", Store.YES),
                new TextField("content", "The content of the third document", Store.YES)));
        writer.commit();
        writer.close();
        IndexReader reader = DirectoryReader.open(luceneDir);
        // now test for docs
        if (reader.numDocs() != 3)
            throw new IOException("number of indexed docs does not match the number expected");
        else
            System.out.println("test passed");
        searchIndex("content", "third");
    } catch (Throwable te) {
        te.printStackTrace();
    }
}

From source file: org.apache.solr.handler.SpellCheckerRequestHandler.java

License: Apache License

/**
 * Processes the following query-string parameters: q, extendedResults,
 * cmd (rebuild or reopen), accuracy, suggestionCount, restrictToField,
 * and onlyMorePopular.
 */
@Override
public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp) throws Exception {
    SolrParams p = req.getParams();
    String words = p.get("q");
    String cmd = p.get("cmd");
    if (cmd != null) {
        cmd = cmd.trim();
        if (cmd.equals("rebuild")) {
            rebuild(req);
            rsp.add("cmdExecuted", "rebuild");
        } else if (cmd.equals("reopen")) {
            reopen();
            rsp.add("cmdExecuted", "reopen");
        } else {
            throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Unrecognized Command: " + cmd);
        }
    }

    // empty query string
    if (null == words || "".equals(words.trim())) {
        return;
    }

    IndexReader indexReader = null;
    String suggestionField = null;
    Float accuracy;
    int numSug;
    boolean onlyMorePopular;
    boolean extendedResults;
    try {
        accuracy = p.getFloat(ACCURACY, p.getFloat("accuracy", DEFAULT_ACCURACY));
        spellChecker.setAccuracy(accuracy);
    } catch (NumberFormatException e) {
        throw new RuntimeException("Accuracy must be a valid positive float", e);
    }
    try {
        numSug = p.getInt(SUGGESTIONS, p.getInt("suggestionCount", DEFAULT_SUGGESTION_COUNT));
    } catch (NumberFormatException e) {
        throw new RuntimeException("Spelling suggestion count must be a valid positive integer", e);
    }
    try {
        onlyMorePopular = p.getBool(POPULAR, DEFAULT_MORE_POPULAR);
    } catch (SolrException e) {
        throw new RuntimeException("'Only more popular' must be a valid boolean", e);
    }
    try {
        extendedResults = p.getBool(EXTENDED, DEFAULT_EXTENDED_RESULTS);
    } catch (SolrException e) {
        throw new RuntimeException("'Extended results' must be a valid boolean", e);
    }

    // when searching for more popular suggestions, a non-null index reader
    // and restricted field are required
    if (onlyMorePopular || extendedResults) {
        indexReader = req.getSearcher().getReader();
        suggestionField = termSourceField;
    }

    if (extendedResults) {

        rsp.add("numDocs", indexReader.numDocs());

        SimpleOrderedMap<Object> results = new SimpleOrderedMap<Object>();
        String[] wordz = words.split(" ");
        for (String word : wordz) {
            SimpleOrderedMap<Object> nl = new SimpleOrderedMap<Object>();
            nl.add("frequency", indexReader.docFreq(new Term(suggestionField, word)));
            String[] suggestions = spellChecker.suggestSimilar(word, numSug, indexReader, suggestionField,
                    onlyMorePopular);

            // suggestion array
            NamedList<Object> sa = new NamedList<Object>();
            for (int i = 0; i < suggestions.length; i++) {
                // suggestion item
                SimpleOrderedMap<Object> si = new SimpleOrderedMap<Object>();
                si.add("frequency", indexReader.docFreq(new Term(termSourceField, suggestions[i])));
                sa.add(suggestions[i], si);
            }
            nl.add("suggestions", sa);
            results.add(word, nl);
        }
        rsp.add("result", results);

    } else {
        rsp.add("words", words);
        if (spellChecker.exist(words)) {
            rsp.add("exist", "true");
        } else {
            rsp.add("exist", "false");
        }
        String[] suggestions = spellChecker.suggestSimilar(words, numSug, indexReader, suggestionField,
                onlyMorePopular);

        rsp.add("suggestions", Arrays.asList(suggestions));
    }
}

From source file: org.archive.nutchwax.tools.DateAdder.java

License: LGPL

public int run(String[] args) throws Exception {
    if (args.length < 4) {
        System.out.println("DateAdder <key-index> <source1> ... <sourceN> <dest> <records>");
        System.exit(0);
    }

    String mainIndexDir = args[0].trim();
    String destIndexDir = args[args.length - 2].trim();
    String recordsFile = args[args.length - 1].trim();

    InputStream recordsStream;
    if ("-".equals(recordsFile)) {
        recordsStream = System.in;
    } else {
        recordsStream = new FileInputStream(recordsFile);
    }

    // Read date-addition records from stdin.
    Map<String, String> dateRecords = new HashMap<String, String>();
    BufferedReader br = new BufferedReader(new InputStreamReader(recordsStream, "UTF-8"));
    String line;
    while ((line = br.readLine()) != null) {
        String fields[] = line.split("\\s+");
        if (fields.length < 3) {
            System.out.println("Malformed line, not enough fields (" + fields.length + "): " + line);
            continue;
        }

        // Key is hash+url, value is String which is a " "-separated list of dates
        String key = fields[0] + fields[1];
        String dates = dateRecords.get(key);
        if (dates != null) {
            dates += " " + fields[2];
            dateRecords.put(key, dates);
        } else {
            dateRecords.put(key, fields[2]);
        }

    }

    IndexReader reader = IndexReader.open(mainIndexDir);

    IndexReader sourceReaders[] = new IndexReader[args.length - 3];
    for (int i = 0; i < sourceReaders.length; i++) {
        sourceReaders[i] = IndexReader.open(args[i + 1]);
    }

    IndexWriter writer = new IndexWriter(destIndexDir, new WhitespaceAnalyzer(), true);

    UrlCanonicalizer canonicalizer = getCanonicalizer(this.getConf());

    for (int i = 0; i < reader.numDocs(); i++) {
        Document oldDoc = reader.document(i);
        Document newDoc = new Document();

        // Copy the values from all the source indices to the new
        // document.
        Set<String> uniqueDates = new HashSet<String>();
        for (IndexReader source : sourceReaders) {
            Document sourceDoc = source.document(i);

            String dates[] = sourceDoc.getValues(NutchWax.DATE_KEY);

            Collections.addAll(uniqueDates, dates);
        }
        for (String date : uniqueDates) {
            newDoc.add(new Field(NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.UN_TOKENIZED));
        }

        // Obtain the new dates for the document.
        String newDates = null;
        try {
            // First, apply URL canonicalization from Wayback
            String canonicalizedUrl = canonicalizer.urlStringToKey(oldDoc.get(NutchWax.URL_KEY));

            // Now, get the digest+URL of the document, look it up in
            // dateRecords and, if found, add the date.
            String key = canonicalizedUrl + oldDoc.get(NutchWax.DIGEST_KEY);

            newDates = dateRecords.get(key);
        } catch (Exception e) {
            // The canonicalizer can throw various types of exceptions
            // due to malformed URIs.
            System.err.println("WARN: Not adding dates on malformed URI: " + oldDoc.get(NutchWax.URL_KEY));
        }

        // If there are any new dates, add them to the new document.
        if (newDates != null) {
            for (String date : newDates.split("\\s+")) {
                newDoc.add(new Field(NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.UN_TOKENIZED));
            }
        }

        // Finally, add the new document to the new index.
        writer.addDocument(newDoc);
    }

    reader.close();
    writer.close();

    return 0;
}
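A caution about this example: it uses reader.numDocs() as the loop bound while fetching documents by sequential ID with reader.document(i). That is only safe when the index contains no deletions, since numDocs() excludes deleted documents; in the general case the bound should be maxDoc(), with deleted documents skipped.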

From source file: org.archive.nutchwax.tools.DumpParallelIndex.java

License: LGPL

private static void dumpIndex(IndexReader reader, String fieldName) throws Exception {
    Collection fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);

    if (!fieldNames.contains(fieldName)) {
        System.out.println("Field not in index: " + fieldName);
        System.exit(2);
    }

    int numDocs = reader.numDocs();

    for (int i = 0; i < numDocs; i++) {
        System.out.println(Arrays.toString(reader.document(i).getValues(fieldName)));
    }

}

From source file: org.archive.nutchwax.tools.DumpParallelIndex.java

License: LGPL

private static void dumpIndex(IndexReader reader) throws Exception {
    Object[] fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL).toArray();
    Arrays.sort(fieldNames);

    for (int i = 0; i < fieldNames.length; i++) {
        System.out.print(fieldNames[i] + "\t");
    }

    System.out.println();

    int numDocs = reader.numDocs();

    for (int i = 0; i < numDocs; i++) {
        for (int j = 0; j < fieldNames.length; j++) {
            System.out.print(Arrays.toString(reader.document(i).getValues((String) fieldNames[j])) + "\t");
        }

        System.out.println();
    }
}

From source file: org.archive.nutchwax.tools.DumpParallelIndex.java

License: LGPL

private static void countDocs(IndexReader reader) throws Exception {
    System.out.println(reader.numDocs());
}