List of usage examples for org.apache.lucene.index IndexReader document
public final Document document(int docID) throws IOException
Returns the stored fields of the nth Document in this index.
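A minimal sketch of the call in isolation, assuming the pre-4.0 Lucene API used in the examples below, a placeholder index path ("/tmp/index"), and a hypothetical stored field named "url":

import java.io.File;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class DocumentExample {
    public static void main(String[] args) throws Exception {
        // Open a reader on an existing index (the path is a placeholder).
        IndexReader reader = IndexReader.open(FSDirectory.open(new File("/tmp/index")));
        try {
            // Walk all document IDs and fetch the stored fields of each one.
            for (int docID = 0; docID < reader.maxDoc(); docID++) {
                if (reader.isDeleted(docID)) {
                    continue; // skip deleted documents
                }
                Document doc = reader.document(docID);
                System.out.println(doc.get("url"));
            }
        } finally {
            reader.close();
        }
    }
}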
From source file:org.arastreju.sge.spi.impl.LuceneBasedNodeKeyTable.java
License:Apache License
@Override
public T lookup(QualifiedName qualifiedName) {
    TermQuery query = new TermQuery(new Term(QN, qualifiedName.toURI()));
    IndexReader reader = reader();
    try {
        IndexSearcher searcher = new IndexSearcher(reader);
        TopDocs result = searcher.search(query, 2);
        searcher.close();
        if (result.scoreDocs.length == 1) {
            Document document = reader.document(result.scoreDocs[0].doc);
            return createID(document);
        } else if (result.scoreDocs.length == 0) {
            return null;
        } else {
            throw new IllegalStateException(
                    "Found more than one document for qualified name: " + qualifiedName);
        }
    } catch (IOException e) {
        throw new RuntimeException("Could not query by qualified name.", e);
    } finally {
        try {
            reader.close();
        } catch (IOException e) {
            throw new RuntimeException("Could not close index reader.", e);
        }
    }
}
From source file:org.archive.nutchwax.tools.DateAdder.java
License:LGPL
public int run(String[] args) throws Exception {
    if (args.length < 4) {
        System.out.println("DateAdder <key-index> <source1> ... <sourceN> <dest> <records>");
        System.exit(0);
    }

    String mainIndexDir = args[0].trim();
    String destIndexDir = args[args.length - 2].trim();
    String recordsFile = args[args.length - 1].trim();

    InputStream recordsStream;
    if ("-".equals(recordsFile)) {
        recordsStream = System.in;
    } else {
        recordsStream = new FileInputStream(recordsFile);
    }

    // Read date-addition records from stdin.
    Map<String, String> dateRecords = new HashMap<String, String>();
    BufferedReader br = new BufferedReader(new InputStreamReader(recordsStream, "UTF-8"));
    String line;
    while ((line = br.readLine()) != null) {
        String fields[] = line.split("\\s+");
        if (fields.length < 3) {
            System.out.println("Malformed line, not enough fields (" + fields.length + "): " + line);
            continue;
        }

        // Key is hash+url, value is String which is a " "-separated list of dates
        String key = fields[0] + fields[1];
        String dates = dateRecords.get(key);
        if (dates != null) {
            dates += " " + fields[2];
            dateRecords.put(key, dates);
        } else {
            dateRecords.put(key, fields[2]);
        }
    }

    IndexReader reader = IndexReader.open(mainIndexDir);

    IndexReader sourceReaders[] = new IndexReader[args.length - 3];
    for (int i = 0; i < sourceReaders.length; i++) {
        sourceReaders[i] = IndexReader.open(args[i + 1]);
    }

    IndexWriter writer = new IndexWriter(destIndexDir, new WhitespaceAnalyzer(), true);

    UrlCanonicalizer canonicalizer = getCanonicalizer(this.getConf());

    for (int i = 0; i < reader.numDocs(); i++) {
        Document oldDoc = reader.document(i);
        Document newDoc = new Document();

        // Copy the values from all the source indices to the new document.
        Set<String> uniqueDates = new HashSet<String>();
        for (IndexReader source : sourceReaders) {
            Document sourceDoc = source.document(i);
            String dates[] = sourceDoc.getValues(NutchWax.DATE_KEY);
            Collections.addAll(uniqueDates, dates);
        }
        for (String date : uniqueDates) {
            newDoc.add(new Field(NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.UN_TOKENIZED));
        }

        // Obtain the new dates for the document.
        String newDates = null;
        try {
            // First, apply URL canonicalization from Wayback
            String canonicalizedUrl = canonicalizer.urlStringToKey(oldDoc.get(NutchWax.URL_KEY));

            // Now, get the digest+URL of the document, look for it in
            // the updateRecords and if found, add the date.
            String key = canonicalizedUrl + oldDoc.get(NutchWax.DIGEST_KEY);
            newDates = dateRecords.get(key);
        } catch (Exception e) {
            // The canonicalizer can throw various types of exceptions due to malformed URIs.
            System.err.println("WARN: Not adding dates on malformed URI: " + oldDoc.get(NutchWax.URL_KEY));
        }

        // If there are any new dates, add them to the new document.
        if (newDates != null) {
            for (String date : newDates.split("\\s+")) {
                newDoc.add(new Field(NutchWax.DATE_KEY, date, Field.Store.YES, Field.Index.UN_TOKENIZED));
            }
        }

        // Finally, add the new document to the new index.
        writer.addDocument(newDoc);
    }

    reader.close();
    writer.close();

    return 0;
}
From source file:org.archive.nutchwax.tools.DumpParallelIndex.java
License:LGPL
private static void dumpIndex(IndexReader reader, String fieldName) throws Exception {
    Collection fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);

    if (!fieldNames.contains(fieldName)) {
        System.out.println("Field not in index: " + fieldName);
        System.exit(2);
    }

    int numDocs = reader.numDocs();
    for (int i = 0; i < numDocs; i++) {
        System.out.println(Arrays.toString(reader.document(i).getValues(fieldName)));
    }
}
From source file:org.archive.nutchwax.tools.DumpParallelIndex.java
License:LGPL
private static void dumpIndex(IndexReader reader) throws Exception {
    Object[] fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL).toArray();
    Arrays.sort(fieldNames);

    for (int i = 0; i < fieldNames.length; i++) {
        System.out.print(fieldNames[i] + "\t");
    }
    System.out.println();

    int numDocs = reader.numDocs();
    for (int i = 0; i < numDocs; i++) {
        for (int j = 0; j < fieldNames.length; j++) {
            System.out.print(Arrays.toString(reader.document(i).getValues((String) fieldNames[j])) + "\t");
        }
        System.out.println();
    }
}
From source file:org.archive.nutchwax.tools.GetUniqFieldValues.java
License:LGPL
private static void dumpUniqValues(String fieldName, String indexDir) throws Exception {
    IndexReader reader = IndexReader.open(indexDir);

    Collection fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);

    if (!fieldNames.contains(fieldName)) {
        System.out.println("Field not in index: " + fieldName);
        System.exit(2);
    }

    int numDocs = reader.numDocs();
    Set<String> values = new HashSet<String>();

    for (int i = 0; i < numDocs; i++) {
        values.add(reader.document(i).get(fieldName));
    }

    for (String v : values) {
        System.out.println(v);
    }
}
From source file:org.archive.nutchwax.tools.LengthNormUpdater.java
License:Apache License
public static void reSetNorms(IndexReader reader, String fieldName, Map<String, Integer> ranks, Similarity sim)
        throws IOException {
    if (VERBOSE > 0)
        System.out.println("Updating field: " + fieldName);

    int[] termCounts = new int[0];
    TermEnum termEnum = null;
    TermDocs termDocs = null;

    termCounts = new int[reader.maxDoc()];
    try {
        termEnum = reader.terms(new Term(fieldName, ""));
        try {
            termDocs = reader.termDocs();
            do {
                Term term = termEnum.term();
                if (term != null && term.field().equals(fieldName)) {
                    termDocs.seek(termEnum.term());
                    while (termDocs.next()) {
                        termCounts[termDocs.doc()] += termDocs.freq();
                    }
                }
            } while (termEnum.next());
        } finally {
            if (null != termDocs)
                termDocs.close();
        }
    } finally {
        if (null != termEnum)
            termEnum.close();
    }

    for (int d = 0; d < termCounts.length; d++) {
        if (!reader.isDeleted(d)) {
            Document doc = reader.document(d);
            String url = doc.get("url");
            if (url != null) {
                Integer rank = ranks.get(url);
                if (rank == null)
                    continue;

                float originalNorm = sim.lengthNorm(fieldName, termCounts[d]);
                byte encodedOrig = sim.encodeNorm(originalNorm);
                float rankedNorm = originalNorm * (float) (Math.log10(rank) + 1);
                byte encodedRank = sim.encodeNorm(rankedNorm);

                if (VERBOSE > 1)
                    System.out.println(fieldName + "\t" + d + "\t" + originalNorm + "\t" + encodedOrig + "\t"
                            + rankedNorm + "\t" + encodedRank);

                reader.setNorm(d, fieldName, encodedRank);
            }
        }
    }
}
From source file:org.archive.tnh.tools.IndexDumper.java
License:Apache License
private static void dumpIndex(IndexReader reader, List<String> fields, boolean includeDocIds) throws Exception {
    Collection fieldNames = reader.getFieldNames(IndexReader.FieldOption.ALL);

    // If no fields were specified, then dump them all.
    if (fields.size() == 0) {
        fields.addAll(fieldNames);
    } else {
        for (String field : fields) {
            if (!fieldNames.contains(field)) {
                System.out.println("Field not in index: " + field);
                System.exit(2);
            }
        }
    }

    int numDocs = reader.numDocs();
    for (int i = 0; i < numDocs; i++) {
        if (includeDocIds) {
            System.out.print(i + "\t");
        }
        for (String field : fields) {
            System.out.print(Arrays.toString(reader.document(i).getValues(field)));
            System.out.print("\t");
        }
        System.out.println();
    }
}
From source file:org.archive.tnh.tools.LengthNormUpdater.java
License:Apache License
public static void updateNorms(IndexReader reader, String fieldName, Map<String, Integer> ranks, Similarity sim)
        throws IOException {
    if (VERBOSE > 0)
        System.out.println("Updating field: " + fieldName);

    int[] termCounts = new int[0];
    TermEnum termEnum = null;
    TermDocs termDocs = null;

    termCounts = new int[reader.maxDoc()];
    try {
        termEnum = reader.terms(new Term(fieldName, ""));
        try {
            termDocs = reader.termDocs();
            do {
                Term term = termEnum.term();
                if (term != null && term.field().equals(fieldName)) {
                    termDocs.seek(termEnum.term());
                    while (termDocs.next()) {
                        termCounts[termDocs.doc()] += termDocs.freq();
                    }
                }
            } while (termEnum.next());
        } finally {
            if (null != termDocs)
                termDocs.close();
        }
    } finally {
        if (null != termEnum)
            termEnum.close();
    }

    for (int d = 0; d < termCounts.length; d++) {
        if (!reader.isDeleted(d)) {
            Document doc = reader.document(d);
            String url = doc.get("url");
            if (url != null) {
                Integer rank = ranks.get(url);
                if (rank == null)
                    continue;

                float originalNorm = sim.lengthNorm(fieldName, termCounts[d]);
                byte encodedOrig = sim.encodeNorm(originalNorm);
                float rankedNorm = originalNorm * (float) (Math.log10(rank) + 1);
                byte encodedRank = sim.encodeNorm(rankedNorm);

                if (VERBOSE > 1)
                    System.out.println(fieldName + "\t" + d + "\t" + originalNorm + "\t" + encodedOrig + "\t"
                            + rankedNorm + "\t" + encodedRank);

                reader.setNorm(d, fieldName, encodedRank);
            }
        }
    }
}
From source file:org.archive.tnh.tools.TermDumper.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("TermDumper [-c|-v value] field <index...>");
        System.exit(1);
    }

    boolean count = false;
    String value = null;
    boolean all = false;

    int i = 0;
    for (; i < args.length; i++) {
        String arg = args[i];

        if ("-h".equals(arg) || "--help".equals(arg)) {
            System.err.println("TermDumper [-c|-v value] field <index...>");
            System.exit(1);
        } else if ("-c".equals(arg) || "--count".equals(arg)) {
            count = true;
        } else if ("-v".equals(arg) || "--value".equals(arg)) {
            value = args[++i];
        } else if ("-a".equals(arg) || "--all".equals(arg)) {
            all = true;
        } else {
            break;
        }
    }

    String field = args[i++];

    java.util.ArrayList<IndexReader> readers = new java.util.ArrayList<IndexReader>(args.length - 1);
    for (; i < args.length; i++) {
        String arg = args[i];
        try {
            IndexReader reader = IndexReader.open(new MMapDirectory(new File(arg)), true);
            readers.add(reader);
        } catch (IOException ioe) {
            System.err.println("Error reading: " + arg);
        }
    }

    for (IndexReader reader : readers) {
        TermDocs termDocs = reader.termDocs();
        TermEnum termEnum = reader.terms(new Term(field));

        try {
            do {
                Term term = termEnum.term();

                if (term == null || !field.equals(term.field()))
                    break;

                if (value == null) {
                    if (count) {
                        termDocs.seek(termEnum);

                        int c = 0;
                        for (; termDocs.next(); c++)
                            ;

                        System.out.print(c + " ");
                    }
                    System.out.println(term.text());
                } else if (value.equals(term.text())) {
                    termDocs.seek(termEnum);

                    while (termDocs.next()) {
                        if (all) {
                            Document d = reader.document(termDocs.doc());
                            System.out.println(termDocs.doc());
                            for (Object o : d.getFields()) {
                                Field f = (Field) o;
                                System.out.println(f.name() + " " + d.get(f.name()));
                            }
                        } else {
                            System.out.println(termDocs.doc() + " " + reader.document(termDocs.doc()).get("url"));
                        }
                    }
                }
            } while (termEnum.next());
        } finally {
            termDocs.close();
            termEnum.close();
        }
    }
}
From source file:org.capelin.transaction.utils.TXLuceneRecordImporter.java
License:GNU General Public License
protected int importRecords(IndexReader reader, Session session) throws IOException {
    CapelinRecord data = null;
    int totalDoc = reader.numDocs();

    // Read documents
    for (int i = 0; i < totalDoc; i++) {
        data = buildRecord(reader.document(i));
        if (null != data)
            session.save(data);

        if (i % BATCH_SIZE == 0) {
            session.flush(); // apply changes to indexes
            session.clear(); // free memory since the queue is processed
            log.info(i);
        }
    }
    return totalDoc;
}