List of usage examples for org.apache.lucene.index IndexReader document
public final Document document(int docID) throws IOException
Returns the stored fields of the nth Document in this index.
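A minimal sketch of the call in isolation, assuming the pre-4.0 Lucene API used in the examples below, a placeholder index path ("/tmp/index"), and a hypothetical stored field named "url":

import java.io.File;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class DocumentExample {
    public static void main(String[] args) throws Exception {
        // Open a reader on an existing index (the path is a placeholder).
        IndexReader reader = IndexReader.open(FSDirectory.open(new File("/tmp/index")));
        try {
            // Walk all document IDs and fetch the stored fields of each one.
            for (int docID = 0; docID < reader.maxDoc(); docID++) {
                if (reader.isDeleted(docID)) {
                    continue; // skip deleted documents
                }
                Document doc = reader.document(docID);
                System.out.println(doc.get("url"));
            }
        } finally {
            reader.close();
        }
    }
}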
From source file:org.arastreju.sge.spi.impl.LuceneBasedNodeKeyTable.java
License:Apache License
@Override
public T lookup(QualifiedName qualifiedName) {
    TermQuery query = new TermQuery(new Term(QN, qualifiedName.toURI()));
    IndexReader reader = reader();
    try {
        IndexSearcher searcher = new IndexSearcher(reader);
        TopDocs result = searcher.search(query, 2);
        searcher.close();
        if (result.scoreDocs.length == 1) {
            Document document = reader.document(result.scoreDocs[0].doc);
            return createID(document);
        } else if (result.scoreDocs.length == 0) {
            return null;
        } else {
            throw new IllegalStateException(
                    "Found more than one document for qualified name: " + qualifiedName);
        }
    } catch (IOException e) {
        throw new RuntimeException("Could not query by qualified name.", e);
    } finally {
        try {
            reader.close();
        } catch (IOException e) {
            throw new RuntimeException("Could not close index reader.", e);
        }
    }
}
From source file:org.archive.nutchwax.tools.DateAdder.java
License:LGPL
public int run(String[] args) throws Exception {
    if (args.length < 4) {
        System.out.println("DateAdder <key-index> <source1> ... <sourceN> <dest> <records>");
        System.exit(0);
    }

    String mainIndexDir = args[0].trim();
    String destIndexDir = args[args.length - 2].trim();
    String recordsFile = args[args.length - 1].trim();

    InputStream recordsStream;
    if ("-".equals(recordsFile)) {
        recordsStream = System.in;
    } else {
        recordsStream = new FileInputStream(recordsFile);
    }

    // Read date-addition records from stdin.
    Map<String, String> dateRecords = new HashMap<String, String>();
    BufferedReader br = new BufferedReader(new InputStreamReader(recordsStream, "UTF-8"));
    String line;
    while ((line = br.readLine()) != null) {
        String fields[] = line.split("\\s+");
        if (fields.length < 3) {
            System.out.println("Malformed line, not enough fields (" + fields.length + "): " + line);
            continue;
        }

        // Key is hash+url, value is String which is a " "-separated list of dates
        String key = fields[0] + fields[1];
        String dates = dateRecords.get(key);
        if (dates != null) {
            dates += " " + fields[2];
            dateRecords.put(key, dates);
        } else {
            dateRecords.put(key, fields[2]);
        }
    }

    IndexReader reader = IndexReader.open(mainIndexDir);

    IndexReader sourceReaders[] = new IndexReader[args.length - 3];
    for (int i = 0; i < sourceReaders.length; i++) {
        sourceReaders[i] = IndexReader.open(args[i + 1]);
    }

    IndexWriter writer = new IndexWriter(destIndexDir, new WhitespaceAnalyzer(), true);

    UrlCanonicalizer canonicalizer = getCanonicalizer(this.getConf());

    for (int i = 0; i < reader.numDocs(); i++) {
        Document oldDoc = reader.document(i);
        Document newDoc = new Document();

        // Copy the values from all the source indices to the new document.
        Set<String> uniqueDates = new HashSet<String>();
        for (IndexReader source : sourceReaders) {
            Document sourceDoc = source.document(i);
            String dates[] = sourceDoc.getValues(NutchWax.DATE_KEY);
            Collections.addAll(uniqueDates, dates);
        }
        for (String date : uniqueDates) {
            newDoc.add(new Field(NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.UN_TOKENIZED));
        }

        // Obtain the new dates for the document.
        String newDates = null;
        try {
            // First, apply URL canonicalization from Wayback
            String canonicalizedUrl = canonicalizer.urlStringToKey(oldDoc.get(NutchWax.URL_KEY));

            // Now, get the digest+URL of the document, look for it in
            // the updateRecords and if found, add the date.
            String key = canonicalizedUrl + oldDoc.get(NutchWax.DIGEST_KEY);
            newDates = dateRecords.get(key);
        } catch (Exception e) {
            // The canonicalizer can throw various types of exceptions due to malformed URIs.
            System.err.println("WARN: Not adding dates on malformed URI: " + oldDoc.get(NutchWax.URL_KEY));
        }

        // If there are any new dates, add them to the new document.
        if (newDates != null) {
            for (String date : newDates.split("\\s+")) {
                newDoc.add(new Field(NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.UN_TOKENIZED));
            }
        }

        // Finally, add the new document to the new index.
        writer.addDocument(newDoc);
    }

    reader.close();
    writer.close();

    return 0;
}
From source file:org.archive.nutchwax.tools.DumpParallelIndex.java
License:LGPL
private static void dumpIndex(IndexReader reader, String fieldName) throws Exception {
    Collection fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);

    if (!fieldNames.contains(fieldName)) {
        System.out.println("Field not in index: " + fieldName);
        System.exit(2);
    }

    int numDocs = reader.numDocs();
    for (int i = 0; i < numDocs; i++) {
        System.out.println(Arrays.toString(reader.document(i).getValues(fieldName)));
    }
}
From source file:org.archive.nutchwax.tools.DumpParallelIndex.java
License:LGPL
private static void dumpIndex(IndexReader reader) throws Exception {
    Object[] fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL).toArray();
    Arrays.sort(fieldNames);

    for (int i = 0; i < fieldNames.length; i++) {
        System.out.print(fieldNames[i] + "\t");
    }
    System.out.println();

    int numDocs = reader.numDocs();
    for (int i = 0; i < numDocs; i++) {
        for (int j = 0; j < fieldNames.length; j++) {
            System.out.print(Arrays.toString(reader.document(i).getValues((String) fieldNames[j])) + "\t");
        }
        System.out.println();
    }
}
From source file:org.archive.nutchwax.tools.GetUniqFieldValues.java
License:LGPL
private static void dumpUniqValues(String fieldName, String indexDir) throws Exception {
    IndexReader reader = IndexReader.open(indexDir);

    Collection fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);

    if (!fieldNames.contains(fieldName)) {
        System.out.println("Field not in index: " + fieldName);
        System.exit(2);
    }

    int numDocs = reader.numDocs();
    Set<String> values = new HashSet<String>();

    for (int i = 0; i < numDocs; i++) {
        values.add(reader.document(i).get(fieldName));
    }

    for (String v : values) {
        System.out.println(v);
    }
}
From source file:org.archive.nutchwax.tools.LengthNormUpdater.java
License:Apache License
public static void reSetNorms(IndexReader reader, String fieldName, Map<String, Integer> ranks, Similarity sim)
        throws IOException {
    if (VERBOSE > 0)
        System.out.println("Updating field: " + fieldName);

    int[] termCounts = new int[0];
    TermEnum termEnum = null;
    TermDocs termDocs = null;

    termCounts = new int[reader.maxDoc()];
    try {
        termEnum = reader.terms(new Term(fieldName, ""));
        try {
            termDocs = reader.termDocs();
            do {
                Term term = termEnum.term();
                if (term != null && term.field().equals(fieldName)) {
                    termDocs.seek(termEnum.term());
                    while (termDocs.next()) {
                        termCounts[termDocs.doc()] += termDocs.freq();
                    }
                }
            } while (termEnum.next());
        } finally {
            if (null != termDocs)
                termDocs.close();
        }
    } finally {
        if (null != termEnum)
            termEnum.close();
    }

    for (int d = 0; d < termCounts.length; d++) {
        if (!reader.isDeleted(d)) {
            Document doc = reader.document(d);
            String url = doc.get("url");
            if (url != null) {
                Integer rank = ranks.get(url);
                if (rank == null)
                    continue;

                float originalNorm = sim.lengthNorm(fieldName, termCounts[d]);
                byte encodedOrig = sim.encodeNorm(originalNorm);
                float rankedNorm = originalNorm * (float) (Math.log10(rank) + 1);
                byte encodedRank = sim.encodeNorm(rankedNorm);

                if (VERBOSE > 1)
                    System.out.println(fieldName + "\t" + d + "\t" + originalNorm + "\t" + encodedOrig + "\t"
                            + rankedNorm + "\t" + encodedRank);

                reader.setNorm(d, fieldName, encodedRank);
            }
        }
    }
}
From source file:org.archive.tnh.tools.IndexDumper.java
License:Apache License
private static void dumpIndex(IndexReader reader, List<String> fields, boolean includeDocIds) throws Exception {
    Collection fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);

    // If no fields were specified, then dump them all.
    if (fields.size() == 0) {
        fields.addAll(fieldNames);
    } else {
        for (String field : fields) {
            if (!fieldNames.contains(field)) {
                System.out.println("Field not in index: " + field);
                System.exit(2);
            }
        }
    }

    int numDocs = reader.numDocs();
    for (int i = 0; i < numDocs; i++) {
        if (includeDocIds) {
            System.out.print(i + "\t");
        }
        for (String field : fields) {
            System.out.print(Arrays.toString(reader.document(i).getValues(field)));
            System.out.print("\t");
        }
        System.out.println();
    }
}
From source file:org.archive.tnh.tools.LengthNormUpdater.java
License:Apache License
public static void updateNorms(IndexReader reader, String fieldName, Map<String, Integer> ranks, Similarity sim)
        throws IOException {
    if (VERBOSE > 0)
        System.out.println("Updating field: " + fieldName);

    int[] termCounts = new int[0];
    TermEnum termEnum = null;
    TermDocs termDocs = null;

    termCounts = new int[reader.maxDoc()];
    try {
        termEnum = reader.terms(new Term(fieldName, ""));
        try {
            termDocs = reader.termDocs();
            do {
                Term term = termEnum.term();
                if (term != null && term.field().equals(fieldName)) {
                    termDocs.seek(termEnum.term());
                    while (termDocs.next()) {
                        termCounts[termDocs.doc()] += termDocs.freq();
                    }
                }
            } while (termEnum.next());
        } finally {
            if (null != termDocs)
                termDocs.close();
        }
    } finally {
        if (null != termEnum)
            termEnum.close();
    }

    for (int d = 0; d < termCounts.length; d++) {
        if (!reader.isDeleted(d)) {
            Document doc = reader.document(d);
            String url = doc.get("url");
            if (url != null) {
                Integer rank = ranks.get(url);
                if (rank == null)
                    continue;

                float originalNorm = sim.lengthNorm(fieldName, termCounts[d]);
                byte encodedOrig = sim.encodeNorm(originalNorm);
                float rankedNorm = originalNorm * (float) (Math.log10(rank) + 1);
                byte encodedRank = sim.encodeNorm(rankedNorm);

                if (VERBOSE > 1)
                    System.out.println(fieldName + "\t" + d + "\t" + originalNorm + "\t" + encodedOrig + "\t"
                            + rankedNorm + "\t" + encodedRank);

                reader.setNorm(d, fieldName, encodedRank);
            }
        }
    }
}
From source file:org.archive.tnh.tools.TermDumper.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("TermDumper [-c|-v value] field <index...>");
        System.exit(1);
    }

    boolean count = false;
    String value = null;
    boolean all = false;

    int i = 0;
    for (; i < args.length; i++) {
        String arg = args[i];

        if ("-h".equals(arg) || "--help".equals(arg)) {
            System.err.println("TermDumper [-c|-v value] field <index...>");
            System.exit(1);
        } else if ("-c".equals(arg) || "--count".equals(arg)) {
            count = true;
        } else if ("-v".equals(arg) || "--value".equals(arg)) {
            value = args[++i];
        } else if ("-a".equals(arg) || "--all".equals(arg)) {
            all = true;
        } else {
            break;
        }
    }

    String field = args[i++];

    java.util.ArrayList<IndexReader> readers = new java.util.ArrayList<IndexReader>(args.length - 1);
    for (; i < args.length; i++) {
        String arg = args[i];
        try {
            IndexReader reader = IndexReader.open(new MMapDirectory(new File(arg)), true);
            readers.add(reader);
        } catch (IOException ioe) {
            System.err.println("Error reading: " + arg);
        }
    }

    for (IndexReader reader : readers) {
        TermDocs termDocs = reader.termDocs();
        TermEnum termEnum = reader.terms(new Term(field));

        try {
            do {
                Term term = termEnum.term();

                if (term == null || !field.equals(term.field()))
                    break;

                if (value == null) {
                    if (count) {
                        termDocs.seek(termEnum);

                        int c = 0;
                        for (; termDocs.next(); c++)
                            ;

                        System.out.print(c + " ");
                    }
                    System.out.println(term.text());
                } else if (value.equals(term.text())) {
                    termDocs.seek(termEnum);

                    while (termDocs.next()) {
                        if (all) {
                            Document d = reader.document(termDocs.doc());
                            System.out.println(termDocs.doc());
                            for (Object o : d.getFields()) {
                                Field f = (Field) o;
                                System.out.println(f.name() + " " + d.get(f.name()));
                            }
                        } else {
                            System.out.println(termDocs.doc() + " " + reader.document(termDocs.doc()).get("url"));
                        }
                    }
                }
            } while (termEnum.next());
        } finally {
            termDocs.close();
            termEnum.close();
        }
    }
}
From source file:org.capelin.transaction.utils.TXLuceneRecordImporter.java
License:GNU General Public License
protected int importRecords(IndexReader reader, Session session) throws IOException {
    CapelinRecord data = null;
    int totalDoc = reader.numDocs();

    // Read documents
    for (int i = 0; i < totalDoc; i++) {
        data = buildRecord(reader.document(i));
        if (null != data)
            session.save(data);

        if (i % BATCH_SIZE == 0) {
            session.flush(); // apply changes to indexes
            session.clear(); // free memory since the queue is processed
            log.info(i);
        }
    }
    return totalDoc;
}