List of usage examples for org.apache.lucene.index IndexReader document

public final Document document(int docID) throws IOException

Returns the nth Document in this index.
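A minimal standalone sketch of the call, before the project-specific examples below. It assumes a Lucene 5+ style API (DirectoryReader, FSDirectory.open(Path)); the index path and the "title" stored field are hypothetical, and several of the examples below use the older IndexReader.open(...) form instead.

import java.nio.file.Paths;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class DocumentLookup {
    public static void main(String[] args) throws Exception {
        // Hypothetical index location; replace with a real index directory.
        Directory dir = FSDirectory.open(Paths.get("/path/to/index"));
        try (IndexReader reader = DirectoryReader.open(dir)) {
            // Valid docIDs run from 0 (inclusive) to reader.maxDoc() (exclusive).
            if (reader.maxDoc() > 0) {
                Document doc = reader.document(0);
                // "title" is a hypothetical stored field; doc.get(...) returns null if absent.
                System.out.println(doc.get("title"));
            }
        }
    }
}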
From source file:srl.corpus.CorpusExtractor.java
License:Open Source License
private void addTagsToDocument(String docName, Vector<Pair<Entity, SrlMatchRegion>> matches, IndexReader reader,
        ProgressMonitor monitor, boolean wait) throws IOException, CorruptIndexException {
    String docNameProper = docName.toLowerCase().split(" ")[0];
    Term t = new Term("name", docNameProper);
    int docNo = Integer.parseInt(docName.split(" ")[1]);
    TermDocs td = reader.termDocs(t);
    Document old;
    while (true) {
        if (!td.next()) {
            throw new RuntimeException("Lost document: " + docName);
        }
        old = reader.document(td.doc());
        String dn = old.getField("name").stringValue();
        String[] ss = dn.split(" ");
        if (dn.matches(".* .*") && ss[0].equals(docNameProper) && Integer.parseInt(ss[1]) == docNo) {
            break;
        }
    }
    String taggedContents = addEntities(new SrlDocument(old, corpus.processor, false), matches);
    try {
        corpus.updateContext(old, old.getField("contents").stringValue(), taggedContents, wait);
    } catch (Exception x) {
        x.printStackTrace();
    }
    /* Document newDoc = new Document();
    newDoc.add(new Field("name", old.getField("name").stringValue(), Field.Store.YES, Field.Index.TOKENIZED));
    newDoc.add(new Field("contents", old.getField("contents").stringValue(), Field.Store.YES, Field.Index.TOKENIZED));
    newDoc.add(new Field("uid", old.getField("uid").stringValue(), Field.Store.YES, Field.Index.TOKENIZED));
    String taggedContents = addEntities(new SrlDocument(old, corpus.processor, false), matches);
    newDoc.add(new Field("taggedContents", taggedContents, Field.Store.YES, Field.Index.TOKENIZED));
    Term uidT = new Term("uid", old.getField("uid").stringValue());
    corpus.indexWriter.updateDocument(uidT, newDoc); */
}
From source file:srl.corpus.CorpusExtractor.java
License:Open Source License
/**
 * Extract all the templates from this corpus
 * @param ruleSets The template rules used for the extraction
 * @param monitor Used to track the progress of the operation
 * @param wait If the corpus is being used by another thread, this parameter sets
 *     whether the thread should wait or throw a CorpusConcurrencyException
 * @throws CorpusConcurrencyException If the corpus is locked and wait is false
 * @throws java.io.IOException
 */
public void extractTemplates(Collection<RuleSet> ruleSets, ProgressMonitor monitor, boolean wait)
        throws IOException, CorpusConcurrencyException {
    if (corpus.isIndexOpen()) {
        corpus.closeIndex(0);
    }
    //corpus.clearTemplateExtractions();
    final HashMap<String, List<String>> allMatches = new HashMap<String, List<String>>();
    int i = 0;
    for (RuleSet ruleSet : ruleSets) {
        int j = 0;
        for (final Pair<String, Rule> rulePair : ruleSet.rules) {
            if (monitor != null) {
                monitor.setMessageVal("Matching rule " + rulePair.first);
                monitor.setProgressVal((float) (i * ruleSet.rules.size() + j) / (float) ruleSets.size()
                        / (float) ruleSet.rules.size());
            }
            corpus.query(rulePair.second.getCorpusQuery(), new QueryHit() {
                public void hit(Document d, StopSignal signal) {
                    String name = d.getField("uid").stringValue();
                    if (allMatches.get(name) == null) {
                        allMatches.put(name, new LinkedList<String>());
                    }
                    List<String> heads = rulePair.second.getHeads(new SrlDocument(d, corpus.processor, true));
                    for (String s : heads)
                        allMatches.get(name).add(s);
                }
            });
        }
        i++;
    }
    long lockID = corpus.reopenIndex(wait);
    IndexReader reader = null;
    try {
        reader = IndexReader.open(corpus.indexWriter.getDirectory());
        i = 0;
        for (Map.Entry<String, List<String>> entry : allMatches.entrySet()) {
            TermDocs td = reader.termDocs(new Term("uid", entry.getKey()));
            if (!td.next()) {
                throw new RuntimeException("Lost Document!");
            }
            Document d = reader.document(td.doc());
            if (monitor != null) {
                monitor.setMessageVal("Updating document " + d.getField("name").stringValue());
                monitor.setProgressVal((float) i++ / allMatches.size());
            }
            d.removeFields("extracted");
            d.add(new Field("extracted", Strings.join("\n", entry.getValue()), Field.Store.YES, Field.Index.NO));
            corpus.indexWriter.updateDocument(new Term("uid", entry.getKey()), d);
        }
    } finally {
        reader.close();
        corpus.optimizeIndex(lockID);
    }
    if (monitor != null) {
        monitor.setMessageVal("Template Extraction complete");
        monitor.setProgressVal(1.0f);
    }
}
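The two srl.corpus examples above resolve a term to a docID with the pre-4.0 TermDocs API before calling reader.document(...). On Lucene 5+ the same term-to-document lookup goes through the leaf readers instead; the sketch below is only a hedged equivalent (the "uid" field name mirrors the examples, while the class and method names are hypothetical).

import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits;

public final class UidLookup {
    private UidLookup() {
    }

    /** Returns the stored fields of the first live document whose "uid" field contains the value, or null. */
    public static Document findByUid(IndexReader reader, String uid) throws IOException {
        Term term = new Term("uid", uid);
        for (LeafReaderContext leaf : reader.leaves()) {
            PostingsEnum postings = leaf.reader().postings(term);
            if (postings == null) {
                continue; // term (or field) not present in this segment
            }
            Bits liveDocs = leaf.reader().getLiveDocs();
            int doc;
            while ((doc = postings.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
                if (liveDocs != null && !liveDocs.get(doc)) {
                    continue; // skip deleted documents
                }
                // Leaf-local doc id plus the leaf's docBase gives the top-level docID.
                return reader.document(leaf.docBase + doc);
            }
        }
        return null;
    }
}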
From source file:stroom.index.server.BenchmarkIndex.java
License:Apache License
private void doSearchOnField(final IndexReader[] readers, final String arg) throws IOException {
    long timeSearching = 0;
    long searchesDone = 0;
    long matchesFound = 0;
    for (int i = 0; i < 10000; i++) {
        final long startTime = System.currentTimeMillis();
        final Query query = new TermQuery(new Term(arg, "user" + getRandomSkewed()));
        for (final IndexReader reader : readers) {
            final List<Integer> documentIdList = new ArrayList<>();
            final IndexSearcher searcher = new IndexSearcher(reader);
            searcher.search(query, new SimpleCollector() {
                private int docBase;

                @Override
                protected void doSetNextReader(final LeafReaderContext context) throws IOException {
                    super.doSetNextReader(context);
                    docBase = context.docBase;
                }

                @Override
                public void collect(final int doc) throws IOException {
                    documentIdList.add(docBase + doc);
                }

                @Override
                public boolean needsScores() {
                    return false;
                }
            });
            for (final Integer docId : documentIdList) {
                final Document doc = reader.document(docId);
                final String streamId = doc.get(IndexConstants.STREAM_ID);
                final String eventId = doc.get(IndexConstants.EVENT_ID);
            }
            matchesFound += documentIdList.size();
        }
        timeSearching += System.currentTimeMillis() - startTime;
        searchesDone++;
    }
    LOGGER.info("Performed " + ModelStringUtil.formatCsv(searchesDone) + " searches on arg " + arg + " in "
            + ModelStringUtil.formatDurationString(timeSearching) + " and found "
            + ModelStringUtil.formatCsv(matchesFound) + " matches");
}
From source file:stroom.search.server.IndexShardSearcherSimpleClient.java
License:Apache License
@Override
public void run() {
    // Boot up spring
    final ApplicationContext appContext = new ClassPathXmlApplicationContext(
            new String[] { "classpath:META-INF/spring/stroomCoreServerContext.xml" });

    final Query query = new TermQuery(new Term(searchField, searchValue));

    final IndexShardService indexShardService = appContext.getBean(IndexShardService.class);
    final StreamStore streamStore = appContext.getBean(StreamStore.class);

    final FindIndexShardCriteria findIndexShardCriteria = new FindIndexShardCriteria();
    findIndexShardCriteria.getIndexShardStatusSet().addAll(IndexShard.READABLE_INDEX_SHARD_STATUS);
    final List<IndexShard> indexShardList = indexShardService.find(findIndexShardCriteria);

    for (final IndexShard indexShard : indexShardList) {
        try {
            final IndexShardSearcher indexShardSearcher = new IndexShardSearcherImpl(indexShard);
            System.out.println("");
            System.out.println("Searching Index " + IndexShardUtil.getIndexPath(indexShard));
            final MaxHitCollector docIdListCollector = new MaxHitCollector(Integer.MAX_VALUE);
            indexShardSearcher.open();
            final IndexReader reader = indexShardSearcher.getReader();
            final IndexSearcher searcher = new IndexSearcher(reader);
            searcher.search(query, docIdListCollector);
            for (final Integer doc : docIdListCollector.getDocIdList()) {
                System.out.println("\tFound match " + doc);
                final Document document = reader.document(doc);
                for (final IndexableField fieldable : document.getFields()) {
                    System.out.println("\t\t" + fieldable.name() + "=" + fieldable.stringValue());
                }
                final Long streamId = Long.valueOf(document.getField(IndexConstants.STREAM_ID).stringValue());
                final Long segment = Long.valueOf(document.getField(IndexConstants.EVENT_ID).stringValue());

                // Try and open the stream source - only open unlocked ones.
                final StreamSource streamSource = streamStore.openStreamSource(streamId);
                if (streamSource != null) {
                    final RASegmentInputStream inputStream = new RASegmentInputStream(streamSource);
                    inputStream.include(segment);
                    System.out.println("\t\t" + StreamUtil.streamToString(inputStream));
                    streamStore.closeStreamSource(streamSource);
                }
            }
            if (docIdListCollector.getDocIdList().size() == 0) {
                System.out.println("\tNo Matches");
            }
            System.out.println("");
            indexShardSearcher.close();
        } catch (final Exception ex) {
            ex.printStackTrace();
        }
    }
}
From source file:stroom.search.server.shard.IndexShardSearcherSimpleClient.java
License:Apache License
@Override
public void run() {
    // Boot up spring
    final ApplicationContext appContext = new ClassPathXmlApplicationContext(
            new String[] { "classpath:META-INF/spring/stroomCoreServerContext.xml" });

    final Query query = new TermQuery(new Term(searchField, searchValue));

    final IndexShardService indexShardService = appContext.getBean(IndexShardService.class);
    final StreamStore streamStore = appContext.getBean(StreamStore.class);

    final FindIndexShardCriteria findIndexShardCriteria = new FindIndexShardCriteria();
    findIndexShardCriteria.getIndexShardStatusSet().addAll(IndexShard.READABLE_INDEX_SHARD_STATUS);
    final List<IndexShard> indexShardList = indexShardService.find(findIndexShardCriteria);

    for (final IndexShard indexShard : indexShardList) {
        try {
            final IndexShardSearcher indexShardSearcher = new IndexShardSearcherImpl(indexShard);
            System.out.println("");
            System.out.println("Searching Index " + IndexShardUtil.getIndexPath(indexShard));
            final MaxHitCollector docIdListCollector = new MaxHitCollector(Integer.MAX_VALUE);
            final IndexReader reader = indexShardSearcher.getReader();
            final IndexSearcher searcher = new IndexSearcher(reader);
            searcher.search(query, docIdListCollector);
            for (final Integer doc : docIdListCollector.getDocIdList()) {
                System.out.println("\tFound match " + doc);
                final Document document = reader.document(doc);
                for (final IndexableField fieldable : document.getFields()) {
                    System.out.println("\t\t" + fieldable.name() + "=" + fieldable.stringValue());
                }
                final Long streamId = Long.valueOf(document.getField(IndexConstants.STREAM_ID).stringValue());
                final Long segment = Long.valueOf(document.getField(IndexConstants.EVENT_ID).stringValue());

                // Try and open the stream source - only open unlocked ones.
                final StreamSource streamSource = streamStore.openStreamSource(streamId);
                if (streamSource != null) {
                    final RASegmentInputStream inputStream = new RASegmentInputStream(streamSource);
                    inputStream.include(segment);
                    System.out.println("\t\t" + StreamUtil.streamToString(inputStream));
                    streamStore.closeStreamSource(streamSource);
                }
            }
            if (docIdListCollector.getDocIdList().size() == 0) {
                System.out.println("\tNo Matches");
            }
            System.out.println("");
            indexShardSearcher.destroy();
        } catch (final Exception ex) {
            ex.printStackTrace();
        }
    }
}
From source file:stroom.search.server.shard.IndexShardSearchTaskHandler.java
License:Apache License
/**
 * This method takes a list of document ids and extracts the stored fields
 * that are required for data display. In some cases, such as batch search, we
 * only want stream and event ids; in those cases no other values are retrieved.
 */
private void getStoredData(final IndexShardSearchTask task, final IndexReader reader, final int docId) {
    final String[] fieldNames = task.getFieldNames();
    try {
        final Document document = reader.document(docId);
        String[] values = null;

        for (int i = 0; i < fieldNames.length; i++) {
            final String storedField = fieldNames[i];
            final IndexableField indexableField = document.getField(storedField);

            // If the field is not in fact stored then it will be null here.
            if (indexableField != null) {
                final String value = indexableField.stringValue();
                if (value != null) {
                    final String trimmed = value.trim();
                    if (trimmed.length() > 0) {
                        if (values == null) {
                            values = new String[fieldNames.length];
                        }
                        values[i] = trimmed;
                    }
                }
            }
        }

        if (values != null) {
            task.getResultReceiver().receive(task.getIndexShardId(), values);
        }
    } catch (final Exception e) {
        error(task, e.getMessage(), e);
    }
}
From source file:titli.model.index.Indexer.java
License:BSD License
/**
 * index the given table
 * @param table the table to be indexed
 * @throws TitliException if problems occur
 */
private void indexTable(Table table) throws TitliException {
    //long start = new Date().getTime();
    File tableIndexDir = IndexUtility.getIndexDirectoryForTable(table.getDatabaseName(), table.getName());
    String query = null;
    try {
        //RAMDirectory does not have a method to flush to the hard disk ! this is bad !
        //RAMDirectory indexDir = new RAMDirectory(tableIndexDir);
        Directory dir = FSDirectory.getDirectory(tableIndexDir, true); // specify the index directory
        IndexWriter indexWriter = new IndexWriter(dir, new StandardAnalyzer(), true);
        indexWriter.setMergeFactor(TitliConstants.INDEX_MERGE_FACTOR);
        indexWriter.setMaxBufferedDocs(TitliConstants.INDEX_MAX_BUFFERED_DOCS);

        //System.out.println("executing : "+"SELECT * FROM "+table.getName());
        query = getExtendedQuery(table);
        ResultSet rs = indexstmt.executeQuery(query);

        while (rs.next()) {
            //this is for compatibility with Nutch Parsers
            //RDBMSRecordParser parser = new RDBMSRecordParser(rs);
            //String content = parser.getParse(new Content()).getText();
            //indexWriter.addDocument(makeDocument(rs, table));
            makeDocument(rs, table, indexWriter);
        }

        Set<String> keySet = documentMap.keySet();
        Iterator<String> iterator = keySet.iterator();
        if (iterator.hasNext()) {
            String keyString = iterator.next();
            Map documentValueMap = documentMap.get(keyString);
            Document document = (Document) documentValueMap.get(TITLIDOC);
            indexWriter.addDocument(document);
        }

        indexWriter.optimize();
        indexWriter.close();
        dir.close();
        rs.close();

        IndexReader reader = null;
        try {
            reader = IndexReader.open(tableIndexDir);
        } catch (IOException e) {
            //throw new TitliIndexRefresherException("TITLI_S_030", "problem while creating index reader for database :"+identifier.getDbName()+" table : "+identifier.getTableName(), e);
        }

        int maxDoc = reader.maxDoc();
        Document doc = null;
        int i;
        //find the doc with given columns and values
        for (i = 0; i < maxDoc; i++) {
            try {
                doc = reader.document(i);
            } catch (IOException e) {
                //throw new TitliIndexRefresherException("TITLI_S_030", "problem reading document from the index reader for database :"+identifier.getDbName()+" table : "+identifier.getTableName(), e);
            }
        }
    } catch (IOException e) {
        throw new TitliIndexException("TITLI_S_009", "I/O problem with " + tableIndexDir, e);
    } catch (SQLException e) {
        throw new TitliIndexException("TITLI_S_010", "SQL problem while executing " + query, e);
    }
}
From source file:titli.model.index.IndexRefresher.java
License:BSD License
/**
 * check if a record with the given unique key values is already in the index
 * @param identifier the record identifier
 * @return true if this record is already indexed otherwise false
 * @throws TitliException if problems occur
 */
public boolean isIndexed(RecordIdentifier identifier) throws TitliException {
    boolean isIndexed = false;
    File indexDir = IndexUtility.getIndexDirectoryForTable(identifier.getDbName(), identifier.getTableName());
    IndexReader reader;
    try {
        FSDirectory dir = FSDirectory.getDirectory(indexDir, false);
        reader = IndexReader.open(dir);
    } catch (IOException e) {
        throw new TitliIndexRefresherException("TITLI_S_030",
                "problem while creating index reader for database :" + identifier.getDbName() + " table : "
                        + identifier.getTableName(), e);
    }

    int maxDoc = reader.maxDoc();
    Document doc = null;
    int i;

    //find the doc with given columns and values
    for (i = 0; i < maxDoc; i++) {
        try {
            //ignore documents marked deleted
            if (reader.isDeleted(i)) {
                continue;
            }
            doc = reader.document(i);
        } catch (IOException e) {
            throw new TitliIndexRefresherException("TITLI_S_030",
                    "problem reading document from the index reader for database :" + identifier.getDbName()
                            + " table : " + identifier.getTableName(), e);
        }

        //this is not the doc we are looking for
        if (identifier.matches(doc)) {
            isIndexed = true;
            break;
        }
    }

    try {
        reader.close();
    } catch (IOException e) {
        throw new TitliIndexRefresherException("TITLI_S_030",
                "problem closing reader for database :" + identifier.getDbName() + " table : "
                        + identifier.getTableName(), e);
    }

    return isIndexed;
}
From source file:uk.ac.ebi.bioinvindex.search.hibernatesearch.SecureStudyFreeTextSearch.java
License:Creative Commons License
private void browse(BIIFilterQuery filterQuery, SearchFactory searchFactory, DirectoryProvider directoryProvider,
        List<Map<StudyBrowseField, String[]>> answer) {
    ReaderProvider readerProvider = searchFactory.getReaderProvider();
    IndexReader reader = readerProvider.openReader(directoryProvider);

    try {
        if (filterQuery.getFilters().size() > 0) {
            Filter filter = queryBuilder.buildFilter(filterQuery);
            DocIdSet docIdSet = filter.getDocIdSet(reader);
            DocIdSetIterator iterator = docIdSet.iterator();
            while (iterator.next()) {
                int i = iterator.doc();
                if (reader.isDeleted(i))
                    continue;
                Document document = reader.document(i);
                processDocument(answer, document);
            }
        } else {
            for (int i = 0; i < reader.maxDoc(); i++) {
                if (reader.isDeleted(i))
                    continue;
                Document document = reader.document(i);
                processDocument(answer, document);
            }
        }
    } catch (IOException e) {
        log.error("Cannot open index ", e);
        throw new SearchException("Cannot open index " + e.getMessage(), e);
    } finally {
        readerProvider.closeReader(reader);
    }
}
From source file:uk.ac.ebi.bioinvindex.search.hibernatesearch.StudyFreeTextSearchImpl.java
License:Creative Commons License
private void browse(BIIFilterQuery filterQuery, SearchFactory searchFactory, DirectoryProvider directoryProvider,
        List<Map<StudyBrowseField, String[]>> answer) {
    ReaderProvider readerProvider = searchFactory.getReaderProvider();
    IndexReader reader = readerProvider.openReader(directoryProvider);

    try {
        if (filterQuery.getFilters().size() > 0) {
            Filter filter = queryBuilder.buildFilter(filterQuery);
            DocIdSet docIdSet = filter.getDocIdSet(reader);
            DocIdSetIterator iterator = docIdSet.iterator();
            while (iterator.next()) {
                int i = iterator.doc();
                if (reader.isDeleted(i))
                    continue;
                Document document = reader.document(i);
                processDocument(answer, document);
            }
        } else {
            for (int i = 0; i < reader.maxDoc(); i++) {
                if (reader.isDeleted(i))
                    continue;
                Document document = reader.document(i);
                processDocument(answer, document);
            }
        }
    } catch (IOException e) {
        e.printStackTrace();
    } finally {
        readerProvider.closeReader(reader);
    }
}