List of usage examples for org.apache.lucene.index IndexReader document

public final Document document(int docID) throws IOException

Returns the nth Document in this index.
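A minimal standalone sketch of the call, before the project-specific examples below. It assumes a Lucene 5+ style API (DirectoryReader, FSDirectory.open(Path)); the index path and the "title" stored field are hypothetical, and several of the examples below use the older IndexReader.open(...) form instead.

import java.nio.file.Paths;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class DocumentLookup {
    public static void main(String[] args) throws Exception {
        // Hypothetical index location; replace with a real index directory.
        Directory dir = FSDirectory.open(Paths.get("/path/to/index"));
        try (IndexReader reader = DirectoryReader.open(dir)) {
            // Valid docIDs run from 0 (inclusive) to reader.maxDoc() (exclusive).
            if (reader.maxDoc() > 0) {
                Document doc = reader.document(0);
                // "title" is a hypothetical stored field; doc.get(...) returns null if absent.
                System.out.println(doc.get("title"));
            }
        }
    }
}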
From source file:srl.corpus.CorpusExtractor.java
License:Open Source License
private void addTagsToDocument(String docName, Vector<Pair<Entity, SrlMatchRegion>> matches, IndexReader reader,
        ProgressMonitor monitor, boolean wait) throws IOException, CorruptIndexException {
    String docNameProper = docName.toLowerCase().split(" ")[0];
    Term t = new Term("name", docNameProper);
    int docNo = Integer.parseInt(docName.split(" ")[1]);
    TermDocs td = reader.termDocs(t);
    Document old;
    while (true) {
        if (!td.next()) {
            throw new RuntimeException("Lost document: " + docName);
        }
        old = reader.document(td.doc());
        String dn = old.getField("name").stringValue();
        String[] ss = dn.split(" ");
        if (dn.matches(".* .*") && ss[0].equals(docNameProper) && Integer.parseInt(ss[1]) == docNo) {
            break;
        }
    }
    String taggedContents = addEntities(new SrlDocument(old, corpus.processor, false), matches);
    try {
        corpus.updateContext(old, old.getField("contents").stringValue(), taggedContents, wait);
    } catch (Exception x) {
        x.printStackTrace();
    }
    /* Document newDoc = new Document();
    newDoc.add(new Field("name", old.getField("name").stringValue(), Field.Store.YES, Field.Index.TOKENIZED));
    newDoc.add(new Field("contents", old.getField("contents").stringValue(), Field.Store.YES, Field.Index.TOKENIZED));
    newDoc.add(new Field("uid", old.getField("uid").stringValue(), Field.Store.YES, Field.Index.TOKENIZED));
    String taggedContents = addEntities(new SrlDocument(old, corpus.processor, false), matches);
    newDoc.add(new Field("taggedContents", taggedContents, Field.Store.YES, Field.Index.TOKENIZED));
    Term uidT = new Term("uid", old.getField("uid").stringValue());
    corpus.indexWriter.updateDocument(uidT, newDoc); */
}
From source file:srl.corpus.CorpusExtractor.java
License:Open Source License
/**
 * Extract all the templates from this corpus
 * @param ruleSets The template rules used for the extraction
 * @param monitor Used to track the progress of the operation
 * @param wait If the corpus is being used by another thread, this parameter sets
 *     whether the thread should wait or throw a CorpusConcurrencyException
 * @throws CorpusConcurrencyException If the corpus is locked and wait is false
 * @throws java.io.IOException
 */
public void extractTemplates(Collection<RuleSet> ruleSets, ProgressMonitor monitor, boolean wait)
        throws IOException, CorpusConcurrencyException {
    if (corpus.isIndexOpen()) {
        corpus.closeIndex(0);
    }
    //corpus.clearTemplateExtractions();
    final HashMap<String, List<String>> allMatches = new HashMap<String, List<String>>();
    int i = 0;
    for (RuleSet ruleSet : ruleSets) {
        int j = 0;
        for (final Pair<String, Rule> rulePair : ruleSet.rules) {
            if (monitor != null) {
                monitor.setMessageVal("Matching rule " + rulePair.first);
                monitor.setProgressVal((float) (i * ruleSet.rules.size() + j) / (float) ruleSets.size()
                        / (float) ruleSet.rules.size());
            }
            corpus.query(rulePair.second.getCorpusQuery(), new QueryHit() {
                public void hit(Document d, StopSignal signal) {
                    String name = d.getField("uid").stringValue();
                    if (allMatches.get(name) == null) {
                        allMatches.put(name, new LinkedList<String>());
                    }
                    List<String> heads = rulePair.second.getHeads(new SrlDocument(d, corpus.processor, true));
                    for (String s : heads)
                        allMatches.get(name).add(s);
                }
            });
        }
        i++;
    }
    long lockID = corpus.reopenIndex(wait);
    IndexReader reader = null;
    try {
        reader = IndexReader.open(corpus.indexWriter.getDirectory());
        i = 0;
        for (Map.Entry<String, List<String>> entry : allMatches.entrySet()) {
            TermDocs td = reader.termDocs(new Term("uid", entry.getKey()));
            if (!td.next()) {
                throw new RuntimeException("Lost Document!");
            }
            Document d = reader.document(td.doc());
            if (monitor != null) {
                monitor.setMessageVal("Updating document " + d.getField("name").stringValue());
                monitor.setProgressVal((float) i++ / allMatches.size());
            }
            d.removeFields("extracted");
            d.add(new Field("extracted", Strings.join("\n", entry.getValue()), Field.Store.YES, Field.Index.NO));
            corpus.indexWriter.updateDocument(new Term("uid", entry.getKey()), d);
        }
    } finally {
        reader.close();
        corpus.optimizeIndex(lockID);
    }
    if (monitor != null) {
        monitor.setMessageVal("Template Extraction complete");
        monitor.setProgressVal(1.0f);
    }
}
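The two srl.corpus examples above resolve a term to a docID with the pre-4.0 TermDocs API before calling reader.document(...). On Lucene 5+ the same term-to-document lookup goes through the leaf readers instead; the sketch below is only a hedged equivalent (the "uid" field name mirrors the examples, while the class and method names are hypothetical).

import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits;

public final class UidLookup {
    private UidLookup() {
    }

    /** Returns the stored fields of the first live document whose "uid" field contains the value, or null. */
    public static Document findByUid(IndexReader reader, String uid) throws IOException {
        Term term = new Term("uid", uid);
        for (LeafReaderContext leaf : reader.leaves()) {
            PostingsEnum postings = leaf.reader().postings(term);
            if (postings == null) {
                continue; // term (or field) not present in this segment
            }
            Bits liveDocs = leaf.reader().getLiveDocs();
            int doc;
            while ((doc = postings.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                if (liveDocs != null && !liveDocs.get(doc)) {
                    continue; // skip deleted documents
                }
                // Leaf-local doc id plus the leaf's docBase gives the top-level docID.
                return reader.document(leaf.docBase + doc);
            }
        }
        return null;
    }
}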
From source file:stroom.index.server.BenchmarkIndex.java
License:Apache License
private void doSearchOnField(final IndexReader[] readers, final String arg) throws IOException {
    long timeSearching = 0;
    long searchesDone = 0;
    long matchesFound = 0;
    for (int i = 0; i < 10000; i++) {
        final long startTime = System.currentTimeMillis();
        final Query query = new TermQuery(new Term(arg, "user" + getRandomSkewed()));
        for (final IndexReader reader : readers) {
            final List<Integer> documentIdList = new ArrayList<>();
            final IndexSearcher searcher = new IndexSearcher(reader);
            searcher.search(query, new SimpleCollector() {
                private int docBase;

                @Override
                protected void doSetNextReader(final LeafReaderContext context) throws IOException {
                    super.doSetNextReader(context);
                    docBase = context.docBase;
                }

                @Override
                public void collect(final int doc) throws IOException {
                    documentIdList.add(docBase + doc);
                }

                @Override
                public boolean needsScores() {
                    return false;
                }
            });
            for (final Integer docId : documentIdList) {
                final Document doc = reader.document(docId);
                final String streamId = doc.get(IndexConstants.STREAM_ID);
                final String eventId = doc.get(IndexConstants.EVENT_ID);
            }
            matchesFound += documentIdList.size();
        }
        timeSearching += System.currentTimeMillis() - startTime;
        searchesDone++;
    }
    LOGGER.info("Performed " + ModelStringUtil.formatCsv(searchesDone) + " searches on arg " + arg + " in "
            + ModelStringUtil.formatDurationString(timeSearching) + " and found "
            + ModelStringUtil.formatCsv(matchesFound) + " matches");
}
From source file:stroom.search.server.IndexShardSearcherSimpleClient.java
License:Apache License
@Override
public void run() {
    // Boot up spring
    final ApplicationContext appContext = new ClassPathXmlApplicationContext(
            new String[] { "classpath:META-INF/spring/stroomCoreServerContext.xml" });

    final Query query = new TermQuery(new Term(searchField, searchValue));

    final IndexShardService indexShardService = appContext.getBean(IndexShardService.class);
    final StreamStore streamStore = appContext.getBean(StreamStore.class);

    final FindIndexShardCriteria findIndexShardCriteria = new FindIndexShardCriteria();
    findIndexShardCriteria.getIndexShardStatusSet().addAll(IndexShard.READABLE_INDEX_SHARD_STATUS);
    final List<IndexShard> indexShardList = indexShardService.find(findIndexShardCriteria);

    for (final IndexShard indexShard : indexShardList) {
        try {
            final IndexShardSearcher indexShardSearcher = new IndexShardSearcherImpl(indexShard);
            System.out.println("");
            System.out.println("Searching Index " + IndexShardUtil.getIndexPath(indexShard));
            final MaxHitCollector docIdListCollector = new MaxHitCollector(Integer.MAX_VALUE);
            indexShardSearcher.open();
            final IndexReader reader = indexShardSearcher.getReader();
            final IndexSearcher searcher = new IndexSearcher(reader);
            searcher.search(query, docIdListCollector);
            for (final Integer doc : docIdListCollector.getDocIdList()) {
                System.out.println("\tFound match " + doc);
                final Document document = reader.document(doc);
                for (final IndexableField fieldable : document.getFields()) {
                    System.out.println("\t\t" + fieldable.name() + "=" + fieldable.stringValue());
                }
                final Long streamId = Long.valueOf(document.getField(IndexConstants.STREAM_ID).stringValue());
                final Long segment = Long.valueOf(document.getField(IndexConstants.EVENT_ID).stringValue());

                // Try and open the stream source - only open unlocked ones.
                final StreamSource streamSource = streamStore.openStreamSource(streamId);
                if (streamSource != null) {
                    final RASegmentInputStream inputStream = new RASegmentInputStream(streamSource);
                    inputStream.include(segment);
                    System.out.println("\t\t" + StreamUtil.streamToString(inputStream));
                    streamStore.closeStreamSource(streamSource);
                }
            }
            if (docIdListCollector.getDocIdList().size() == 0) {
                System.out.println("\tNo Matches");
            }
            System.out.println("");
            indexShardSearcher.close();
        } catch (final Exception ex) {
            ex.printStackTrace();
        }
    }
}
From source file:stroom.search.server.shard.IndexShardSearcherSimpleClient.java
License:Apache License
@Override
public void run() {
    // Boot up spring
    final ApplicationContext appContext = new ClassPathXmlApplicationContext(
            new String[] { "classpath:META-INF/spring/stroomCoreServerContext.xml" });

    final Query query = new TermQuery(new Term(searchField, searchValue));

    final IndexShardService indexShardService = appContext.getBean(IndexShardService.class);
    final StreamStore streamStore = appContext.getBean(StreamStore.class);

    final FindIndexShardCriteria findIndexShardCriteria = new FindIndexShardCriteria();
    findIndexShardCriteria.getIndexShardStatusSet().addAll(IndexShard.READABLE_INDEX_SHARD_STATUS);
    final List<IndexShard> indexShardList = indexShardService.find(findIndexShardCriteria);

    for (final IndexShard indexShard : indexShardList) {
        try {
            final IndexShardSearcher indexShardSearcher = new IndexShardSearcherImpl(indexShard);
            System.out.println("");
            System.out.println("Searching Index " + IndexShardUtil.getIndexPath(indexShard));
            final MaxHitCollector docIdListCollector = new MaxHitCollector(Integer.MAX_VALUE);
            final IndexReader reader = indexShardSearcher.getReader();
            final IndexSearcher searcher = new IndexSearcher(reader);
            searcher.search(query, docIdListCollector);
            for (final Integer doc : docIdListCollector.getDocIdList()) {
                System.out.println("\tFound match " + doc);
                final Document document = reader.document(doc);
                for (final IndexableField fieldable : document.getFields()) {
                    System.out.println("\t\t" + fieldable.name() + "=" + fieldable.stringValue());
                }
                final Long streamId = Long.valueOf(document.getField(IndexConstants.STREAM_ID).stringValue());
                final Long segment = Long.valueOf(document.getField(IndexConstants.EVENT_ID).stringValue());

                // Try and open the stream source - only open unlocked ones.
                final StreamSource streamSource = streamStore.openStreamSource(streamId);
                if (streamSource != null) {
                    final RASegmentInputStream inputStream = new RASegmentInputStream(streamSource);
                    inputStream.include(segment);
                    System.out.println("\t\t" + StreamUtil.streamToString(inputStream));
                    streamStore.closeStreamSource(streamSource);
                }
            }
            if (docIdListCollector.getDocIdList().size() == 0) {
                System.out.println("\tNo Matches");
            }
            System.out.println("");
            indexShardSearcher.destroy();
        } catch (final Exception ex) {
            ex.printStackTrace();
        }
    }
}
From source file:stroom.search.server.shard.IndexShardSearchTaskHandler.java
License:Apache License
/**
 * This method takes a list of document ids and extracts the stored fields
 * that are required for data display. In some cases, such as batch search, we
 * only want stream and event ids; in those cases no other values are retrieved.
 */
private void getStoredData(final IndexShardSearchTask task, final IndexReader reader, final int docId) {
    final String[] fieldNames = task.getFieldNames();
    try {
        final Document document = reader.document(docId);
        String[] values = null;

        for (int i = 0; i < fieldNames.length; i++) {
            final String storedField = fieldNames[i];
            final IndexableField indexableField = document.getField(storedField);

            // If the field is not in fact stored then it will be null here.
            if (indexableField != null) {
                final String value = indexableField.stringValue();
                if (value != null) {
                    final String trimmed = value.trim();
                    if (trimmed.length() > 0) {
                        if (values == null) {
                            values = new String[fieldNames.length];
                        }
                        values[i] = trimmed;
                    }
                }
            }
        }

        if (values != null) {
            task.getResultReceiver().receive(task.getIndexShardId(), values);
        }
    } catch (final Exception e) {
        error(task, e.getMessage(), e);
    }
}
From source file:titli.model.index.Indexer.java
License:BSD License
/**
 * index the given table
 * @param table the table to be indexed
 * @throws TitliException if problems occur
 */
private void indexTable(Table table) throws TitliException {
    //long start = new Date().getTime();
    File tableIndexDir = IndexUtility.getIndexDirectoryForTable(table.getDatabaseName(), table.getName());
    String query = null;
    try {
        //RAMDirectory does not have a method to flush to the hard disk ! this is bad !
        //RAMDirectory indexDir = new RAMDirectory(tableIndexDir);
        Directory dir = FSDirectory.getDirectory(tableIndexDir, true); // specify the index directory
        IndexWriter indexWriter = new IndexWriter(dir, new StandardAnalyzer(), true);
        indexWriter.setMergeFactor(TitliConstants.INDEX_MERGE_FACTOR);
        indexWriter.setMaxBufferedDocs(TitliConstants.INDEX_MAX_BUFFERED_DOCS);

        //System.out.println("executing : "+"SELECT * FROM "+table.getName());
        query = getExtendedQuery(table);
        ResultSet rs = indexstmt.executeQuery(query);

        while (rs.next()) {
            //this is for compatibility with Nutch Parsers
            //RDBMSRecordParser parser = new RDBMSRecordParser(rs);
            //String content = parser.getParse(new Content()).getText();
            //indexWriter.addDocument(makeDocument(rs, table));
            makeDocument(rs, table, indexWriter);
        }

        Set<String> keySet = documentMap.keySet();
        Iterator<String> iterator = keySet.iterator();
        if (iterator.hasNext()) {
            String keyString = iterator.next();
            Map documentValueMap = documentMap.get(keyString);
            Document document = (Document) documentValueMap.get(TITLIDOC);
            indexWriter.addDocument(document);
        }

        indexWriter.optimize();
        indexWriter.close();
        dir.close();
        rs.close();

        IndexReader reader = null;
        try {
            reader = IndexReader.open(tableIndexDir);
        } catch (IOException e) {
            //throw new TitliIndexRefresherException("TITLI_S_030", "problem while creating index reader for database :"+identifier.getDbName()+" table : "+identifier.getTableName(), e);
        }

        int maxDoc = reader.maxDoc();
        Document doc = null;
        int i;
        //find the doc with given columns and values
        for (i = 0; i < maxDoc; i++) {
            try {
                doc = reader.document(i);
            } catch (IOException e) {
                //throw new TitliIndexRefresherException("TITLI_S_030", "problem reading document from the index reader for database :"+identifier.getDbName()+" table : "+identifier.getTableName(), e);
            }
        }
    } catch (IOException e) {
        throw new TitliIndexException("TITLI_S_009", "I/O problem with " + tableIndexDir, e);
    } catch (SQLException e) {
        throw new TitliIndexException("TITLI_S_010", "SQL problem while executing " + query, e);
    }
}
From source file:titli.model.index.IndexRefresher.java
License:BSD License
/**
 * check if a record with the given unique key values is already in the index
 * @param identifier the record identifier
 * @return true if this record is already indexed otherwise false
 * @throws TitliException if problems occur
 */
public boolean isIndexed(RecordIdentifier identifier) throws TitliException {
    boolean isIndexed = false;
    File indexDir = IndexUtility.getIndexDirectoryForTable(identifier.getDbName(), identifier.getTableName());
    IndexReader reader;
    try {
        FSDirectory dir = FSDirectory.getDirectory(indexDir, false);
        reader = IndexReader.open(dir);
    } catch (IOException e) {
        throw new TitliIndexRefresherException("TITLI_S_030",
                "problem while creating index reader for database :" + identifier.getDbName() + " table : "
                        + identifier.getTableName(), e);
    }

    int maxDoc = reader.maxDoc();
    Document doc = null;
    int i;

    //find the doc with given columns and values
    for (i = 0; i < maxDoc; i++) {
        try {
            //ignore documents marked deleted
            if (reader.isDeleted(i)) {
                continue;
            }
            doc = reader.document(i);
        } catch (IOException e) {
            throw new TitliIndexRefresherException("TITLI_S_030",
                    "problem reading document from the index reader for database :" + identifier.getDbName()
                            + " table : " + identifier.getTableName(), e);
        }

        //this is not the doc we are looking for
        if (identifier.matches(doc)) {
            isIndexed = true;
            break;
        }
    }

    try {
        reader.close();
    } catch (IOException e) {
        throw new TitliIndexRefresherException("TITLI_S_030",
                "problem closing reader for database :" + identifier.getDbName() + " table : "
                        + identifier.getTableName(), e);
    }

    return isIndexed;
}
From source file:uk.ac.ebi.bioinvindex.search.hibernatesearch.SecureStudyFreeTextSearch.java
License:Creative Commons License
private void browse(BIIFilterQuery filterQuery, SearchFactory searchFactory, DirectoryProvider directoryProvider,
        List<Map<StudyBrowseField, String[]>> answer) {
    ReaderProvider readerProvider = searchFactory.getReaderProvider();
    IndexReader reader = readerProvider.openReader(directoryProvider);

    try {
        if (filterQuery.getFilters().size() > 0) {
            Filter filter = queryBuilder.buildFilter(filterQuery);
            DocIdSet docIdSet = filter.getDocIdSet(reader);
            DocIdSetIterator iterator = docIdSet.iterator();
            while (iterator.next()) {
                int i = iterator.doc();
                if (reader.isDeleted(i))
                    continue;
                Document document = reader.document(i);
                processDocument(answer, document);
            }
        } else {
            for (int i = 0; i < reader.maxDoc(); i++) {
                if (reader.isDeleted(i))
                    continue;
                Document document = reader.document(i);
                processDocument(answer, document);
            }
        }
    } catch (IOException e) {
        log.error("Cannot open index ", e);
        throw new SearchException("Cannot open index " + e.getMessage(), e);
    } finally {
        readerProvider.closeReader(reader);
    }
}
From source file:uk.ac.ebi.bioinvindex.search.hibernatesearch.StudyFreeTextSearchImpl.java
License:Creative Commons License
private void browse(BIIFilterQuery filterQuery, SearchFactory searchFactory, DirectoryProvider directoryProvider,
        List<Map<StudyBrowseField, String[]>> answer) {
    ReaderProvider readerProvider = searchFactory.getReaderProvider();
    IndexReader reader = readerProvider.openReader(directoryProvider);

    try {
        if (filterQuery.getFilters().size() > 0) {
            Filter filter = queryBuilder.buildFilter(filterQuery);
            DocIdSet docIdSet = filter.getDocIdSet(reader);
            DocIdSetIterator iterator = docIdSet.iterator();
            while (iterator.next()) {
                int i = iterator.doc();
                if (reader.isDeleted(i))
                    continue;
                Document document = reader.document(i);
                processDocument(answer, document);
            }
        } else {
            for (int i = 0; i < reader.maxDoc(); i++) {
                if (reader.isDeleted(i))
                    continue;
                Document document = reader.document(i);
                processDocument(answer, document);
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        readerProvider.closeReader(reader);
    }
}