List of usage examples for org.apache.lucene.index.DirectoryReader.open
public static DirectoryReader open(final Directory directory) throws IOException
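Minimal usage sketch (not taken from any of the projects listed below; the index path and field names are hypothetical, and Lucene 5.x or later APIs are assumed): open a Directory over the index folder, open a DirectoryReader on it, search through an IndexSearcher, and close the reader when finished.

import java.nio.file.Paths;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class DirectoryReaderOpenSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical index location; point this at an existing Lucene index.
        Directory dir = FSDirectory.open(Paths.get("/tmp/myindex"));
        // DirectoryReader.open(Directory) returns a point-in-time view of the index.
        DirectoryReader reader = DirectoryReader.open(dir);
        try {
            IndexSearcher searcher = new IndexSearcher(reader);
            // "contents" and "path" are example field names, not part of the Lucene API.
            TopDocs hits = searcher.search(new TermQuery(new Term("contents", "lucene")), 10);
            for (ScoreDoc hit : hits.scoreDocs) {
                Document doc = searcher.doc(hit.doc);
                System.out.println(hit.score + " " + doc.get("path"));
            }
        } finally {
            reader.close(); // the reader holds file handles until closed
        }
    }
}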
From source file:de.dkt.eservices.elucene.indexmanagement.SearchFiles.java
License:Apache License
/**
 * Searches a query against a field of an index and returns hitsToReturn documents.
 * @param index index where to search for the query text
 * @param sFields document fields against which to match the query
 * @param queryString text of the input query
 * @param hitsToReturn number of documents to be returned
 * @return JSON format string containing the results information and content
 * @throws ExternalServiceFailedException
 */
public static JSONObject search(String index, String sFields, String sAnalyzers, String queryType,
        String queryString, String language, int hitsToReturn) throws ExternalServiceFailedException {
    try {
        // System.out.println(index + "__" + sFields + "__" + sAnalyzers + "__" + queryType + "__" + language + "__" + hitsToReturn);
        // System.out.println(indexDirectory);
        Date start = new Date();
        File f = FileFactory.generateFileInstance(indexDirectory + index);
        if (f == null || !f.exists()) {
            throw new ExternalServiceFailedException(
                    "Specified index [" + indexDirectory + index + "] does not exist.");
        }
        logger.info("Searching in folder: " + f.getAbsolutePath());
        Directory dir = FSDirectory.open(f);
        IndexReader reader = DirectoryReader.open(dir);
        IndexSearcher searcher = new IndexSearcher(reader);
        // System.out.println(reader.docFreq(new Term("content", "madrid")));
        Document doc = reader.document(0);
        // System.out.println(reader.numDocs());
        // System.out.println(doc);
        String[] fields = sFields.split(";");
        String[] analyzers = sAnalyzers.split(";");
        if (fields.length != analyzers.length) {
            logger.error("The number of fields and analyzers is different");
            throw new BadRequestException("The number of fields and analyzers is different");
        }
        // System.out.println("CHECK IF THE QUERY IS WORKING PROPERLY: " + queryString);
        Query query = OwnQueryParser.parseQuery(queryType, queryString, fields, analyzers, language);
        // System.out.println("\t QUERY: " + query);
        TopDocs results = searcher.search(query, hitsToReturn);
        Explanation exp = searcher.explain(query, 0);
        // System.out.println("EXPLANATION: " + exp);
        // System.out.println("TOTAL HITS: " + results.totalHits);
        Date end = new Date();
        logger.info("Time: " + (end.getTime() - start.getTime()) + "ms");
        JSONObject resultModel = JSONLuceneResultConverter.convertResults(query, searcher, results);
        reader.close();
        return resultModel;
    } catch (IOException e) {
        e.printStackTrace();
        throw new ExternalServiceFailedException("IOException with message: " + e.getMessage());
    }
}
From source file:de.elbe5.cms.search.SearchBean.java
License:Open Source License
public void searchContent(ContentSearchResultData result) {
    result.getResults().clear();
    String[] fieldNames = result.getFieldNames();
    ScoreDoc[] hits = null;
    float maxScore = 0f;
    try {
        String indexPath = ApplicationPath.getAppPath() + "contentindex";
        ensureDirectory(indexPath);
        IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexPath)));
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new StandardAnalyzer();
        MultiFieldQueryParser parser = new MultiFieldQueryParser(fieldNames, analyzer);
        String pattern = result.getPattern();
        pattern = pattern.trim();
        Query query = null;
        if (pattern.length() != 0) {
            query = parser.parse(pattern);
            // Log.log("Searching for: " + query.toString());
            TopDocs topDocs = searcher.search(query, result.getMaxSearchResults());
            hits = topDocs.scoreDocs;
            maxScore = topDocs.getMaxScore();
        }
        if (hits != null) {
            for (ScoreDoc hit : hits) {
                Document doc = searcher.doc(hit.doc);
                ContentSearchData data = null;
                String type = doc.get("type");
                switch (type) {
                case SiteSearchData.TYPE:
                    data = new SiteSearchData();
                    break;
                case PageSearchData.TYPE:
                    data = new PageSearchData();
                    break;
                case FileSearchData.TYPE:
                    data = new FileSearchData();
                    break;
                }
                assert (data != null);
                data.setDoc(doc);
                data.setScore(maxScore <= 1f ? hit.score : hit.score / maxScore);
                data.evaluateDoc();
                data.setContexts(query, analyzer);
                result.getResults().add(data);
            }
        }
        reader.close();
    } catch (Exception ignore) {
    }
}
From source file:de.elbe5.cms.search.SearchBean.java
License:Open Source License
public void searchUsers(UserSearchResultData result) {
    result.getResults().clear();
    String[] fieldNames = result.getFieldNames();
    ScoreDoc[] hits = null;
    float maxScore = 0f;
    try {
        String indexPath = ApplicationPath.getAppPath() + "userindex";
        ensureDirectory(indexPath);
        IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexPath)));
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new StandardAnalyzer();
        MultiFieldQueryParser parser = new MultiFieldQueryParser(fieldNames, analyzer);
        String pattern = result.getPattern();
        pattern = pattern.trim();
        Query query = null;
        if (pattern.length() != 0) {
            query = parser.parse(pattern);
            // Log.log("Searching for: " + query.toString());
            TopDocs topDocs = searcher.search(query, result.getMaxSearchResults());
            hits = topDocs.scoreDocs;
            maxScore = topDocs.getMaxScore();
        }
        if (hits != null) {
            for (ScoreDoc hit : hits) {
                Document doc = searcher.doc(hit.doc);
                UserSearchData data = new UserSearchData();
                data.setDoc(doc);
                data.setScore(maxScore <= 1f ? hit.score : hit.score / maxScore);
                data.evaluateDoc();
                data.setContexts(query, analyzer);
                result.getResults().add(data);
            }
        }
        reader.close();
    } catch (Exception ignore) {
    }
}
From source file:de.hsmannheim.ss15.alr.searchengine.DefaultLuceneController.java
public List<StoredDocument> doSearch(String queryString) throws IOException, ParseException {
    String field = "contents";
    String queries = null;
    boolean raw = false;
    int hitsPerPage = 10;
    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(indexDir)));
    IndexSearcher searcher = new IndexSearcher(reader);
    Analyzer analyzer = new StandardAnalyzer();
    BufferedReader in = null;
    if (queries != null) {
        in = Files.newBufferedReader(Paths.get(queries), StandardCharsets.UTF_8);
    } else {
        in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
    }
    QueryParser parser = new QueryParser(field, analyzer);
    Query query = parser.parse(queryString);
    Highlighter highlighter = new Highlighter(new QueryScorer(query));
    TotalHitCountCollector collector = new TotalHitCountCollector();
    searcher.search(query, collector);
    TopDocs topDocs = searcher.search(query, Math.max(1, collector.getTotalHits()));
    List<StoredDocument> results = new ArrayList<>();
    for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
        StoredDocument doc = searcher.doc(scoreDoc.doc);
        try {
            File file = new File(doc.get("path"));
            BufferedReader docReader = new BufferedReader(
                    new InputStreamReader(Files.newInputStream(file.toPath()), StandardCharsets.UTF_8));
            List<String> lines = new ArrayList<>();
            while (docReader.ready()) {
                lines.add(docReader.readLine());
            }
            lines.remove(0);
            lines.remove(0);
            lines.remove(0);
            String content = "";
            for (String s : lines) {
                content = content + s;
            }
            String highLight = highlighter.getBestFragment(analyzer, null, content);
            if (highLight == null) {
                LOGGER.warn("No Highlight found");
            } else {
                doc.add(new TextField("highlight", highLight, Field.Store.YES));
            }
        } catch (InvalidTokenOffsetsException ex) {
            LOGGER.warn("No Highlight found");
        }
        results.add(doc);
    }
    reader.close();
    return results;
}
From source file:de.ks.lucene.LuceneTaggingTest.java
License:Apache License
@Test
public void testTags() throws Exception {
    IndexWriter writer = new IndexWriter(directory, new IndexWriterConfig(analyzer));
    List<String> allTags = Arrays.asList("Bla Blubb", "Blubb", "Blubber Huhu", "Bla Huhu", "Haha");
    for (String tag : allTags) {
        Document doc = new Document();
        doc.add(new TextField("tags", tag, Field.Store.YES));
        writer.addDocument(doc);
    }
    writer.close();

    DirectoryReader directoryReader = DirectoryReader.open(directory);
    IndexSearcher searcher = new IndexSearcher(directoryReader);

    String term = "blubb";
    TermQuery termQuery = new TermQuery(new Term("tags", term));
    TopDocs search = searcher.search(termQuery, 50);
    log("TermQuery", searcher, search);

    FuzzyQuery fuzzyQuery = new FuzzyQuery(new Term("tags", term));
    search = searcher.search(fuzzyQuery, 50);
    log("FuzzyQuery", searcher, search);

    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    builder.add(new TermQuery(new Term("tags", "blubb")), BooleanClause.Occur.SHOULD);
    builder.add(new TermQuery(new Term("tags", "bla")), BooleanClause.Occur.SHOULD);
    BooleanQuery query = builder.build();
    search = searcher.search(query, 50);
    log("BooleanQuery", searcher, search);
}
From source file:de.minecrawler.search.AbstractSearchEngine.java
License:Open Source License
/**
 * Starts a search on the parsed documents using a search query.
 *
 * @param queryString
 *            The query string <a href=
 *            "http://lucene.apache.org/core/4_1_0/queryparser/org/apache/lucene/queryparser/classic/package-summary.html"
 *            >Query Format</a>
 * @param limit
 *            The maximum number of results
 * @return List of results
 */
public List<CrawledWebsiteResult> search(String queryString, int limit) {
    try {
        DirectoryReader ireader = DirectoryReader.open(this.dir);
        IndexSearcher isearcher = new IndexSearcher(ireader);
        QueryParser parser = new MultiFieldQueryParser(LUCENE_VERSION, FIELDS, ANALYZER);
        Query query = parser.parse(queryString);
        ScoreDoc[] hits = isearcher.search(query, null, limit).scoreDocs;
        List<CrawledWebsiteResult> result = new ArrayList<CrawledWebsiteResult>();
        for (int i = 0; i < hits.length; ++i) {
            Document hitDoc = isearcher.doc(hits[i].doc);
            CrawledWebsite website = extractWebsite(hitDoc);
            result.add(new CrawledWebsiteResult(website, i + 1, hits[i].score));
        }
        ireader.close();
        return result;
    } catch (IOException e) {
        e.printStackTrace();
        return Collections.<CrawledWebsiteResult>emptyList();
    } catch (ParseException e) {
        System.out.println("Wrong query! Check your query format!");
        System.out.println(e.getMessage());
        return Collections.<CrawledWebsiteResult>emptyList();
    }
}
From source file:de.qaware.chronix.lucene.client.LuceneIndex.java
License:Apache License
/**
 * Closes the index writer if it is open.
 * Then opens the index reader.
 *
 * @return an open Lucene reader.
 * @throws IOException if the Lucene reader cannot be opened or created
 */
public IndexReader getOpenReader() throws IOException {
    if (writerOpen()) {
        LOGGER.debug("Closing writer");
        writer.close();
    }
    if (readerClosed()) {
        LOGGER.debug("Opening reader");
        reader = DirectoryReader.open(directory);
    }
    return reader;
}
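A related note on the pattern above: once a reader is open, Lucene also provides DirectoryReader.openIfChanged, which returns a new reader only when the index has changed. The following is a rough sketch of how a helper like getOpenReader() could use it; it is not part of the LuceneIndex source above, and it assumes the same reader and directory fields (with the reader usable as a DirectoryReader) plus a prior commit of any pending writes.

// Sketch only: refresh an existing reader with openIfChanged instead of
// always reopening; assumes 'reader' is (or can be cast to) a DirectoryReader
// and 'directory' is the same Directory field used in the example above.
public IndexReader getOpenReader() throws IOException {
    if (reader == null) {
        reader = DirectoryReader.open(directory);
    } else {
        DirectoryReader refreshed = DirectoryReader.openIfChanged((DirectoryReader) reader);
        if (refreshed != null) {   // null means the index has not changed
            reader.close();        // release the stale reader's file handles
            reader = refreshed;
        }
    }
    return reader;
}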
From source file:de.tuberlin.dima.cuttlefish.preprocessing.vectorization.Vectorizer.java
License:Open Source License
public void vectorize(File luceneIndexDir, File outputDir) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    SequenceFile.Writer writer = null;
    FeatureDictionary dict = new FeatureDictionary();
    DirectoryReader reader = null;
    try {
        reader = DirectoryReader.open(new SimpleFSDirectory(luceneIndexDir));
        writer = SequenceFile.createWriter(fs, conf, new Path(outputDir.toString(), "documentVectors.seq"),
                IDAndCodes.class, VectorWritable.class);
        IDAndCodes idAndCodes = new IDAndCodes();
        VectorWritable vectorWritable = new VectorWritable();

        Fields fields = MultiFields.getFields(reader);
        if (fields != null) {
            Iterator<String> fieldNames = fields.iterator();
            while (fieldNames.hasNext()) {
                String field = fieldNames.next();
                if (!field.startsWith("bip:") && !"itemID".equals(field)) {
                    Terms terms = fields.terms(field);
                    TermsEnum termsEnum = terms.iterator(null);
                    BytesRef text;
                    while ((text = termsEnum.next()) != null) {
                        dict.addTextFeature(field, text.utf8ToString());
                    }
                }
            }
        }

        int numDocsVectorized = 0;
        for (int docID = 0; docID < reader.maxDoc(); docID++) {
            Document doc = reader.document(docID);
            int itemID = doc.getField("itemID").numericValue().intValue();
            RandomAccessSparseVector documentVector = new RandomAccessSparseVector(dict.numFeatures());
            Multimap<String, String> codes = HashMultimap.create();

            for (IndexableField field : doc.getFields()) {
                String fieldName = field.name();
                if (!fieldName.startsWith("bip:") && !"itemID".equals(fieldName)) {
                    Terms termFreqVector = reader.getTermVector(docID, fieldName);
                    if (termFreqVector != null) {
                        int maxTermFrequency = maxTermFrequency(termFreqVector);
                        TermsEnum te = termFreqVector.iterator(null);
                        BytesRef term;
                        while ((term = te.next()) != null) {
                            String termStr = term.utf8ToString();
                            int termFrequency = (int) te.totalTermFreq();
                            int documentFrequency = reader.docFreq(new Term(fieldName, term));
                            int numDocs = reader.numDocs();
                            double weight = weighting.weight(fieldName, termStr, termFrequency,
                                    documentFrequency, maxTermFrequency, numDocs);
                            int featureIndex = dict.index(fieldName, term.utf8ToString());
                            documentVector.setQuick(featureIndex, weight);
                        }
                    }
                } else if (fieldName.startsWith("bip:")) {
                    for (String value : doc.getValues(fieldName)) {
                        codes.put(fieldName, value);
                    }
                }
            }

            Vector featureVector = new SequentialAccessSparseVector(documentVector);
            weighting.normalize(featureVector);
            idAndCodes.set(itemID, codes);
            vectorWritable.set(featureVector);
            writer.append(idAndCodes, vectorWritable);

            numDocsVectorized++;
            if (numDocsVectorized % 100 == 0) {
                log.info("Vectorized {} documents", numDocsVectorized);
            }
        }
        log.info("Vectorized {} documents", numDocsVectorized);
        dict.writeToFile(new File(outputDir, "features.txt"));
        log.info("Wrote feature dictionary");
    } finally {
        Closeables.close(reader, true);
        Closeables.close(writer, true);
    }
}
From source file:de.tudarmstadt.lt.lm.lucenebased.CountingStringLM.java
License:Apache License
public CountingStringLM(int order, File index_dir) {
    _order = order;
    try {
        LOG.info("Loading index from or creating index in '{}'.", index_dir.getAbsolutePath());
        File index_dir_vocab = new File(index_dir, "vocab");
        File index_dir_ngram = new File(index_dir, "ngram");
        _fixed = true;

        Directory directory = MMapDirectory.open(index_dir_ngram);
        // directory = new RAMDirectory(directory, IOContext.DEFAULT);
        _reader_ngram = DirectoryReader.open(directory);
        _searcher_ngram = new IndexSearcher(_reader_ngram);

        directory = MMapDirectory.open(index_dir_vocab);
        // directory = new RAMDirectory(directory, IOContext.DEFAULT);
        _reader_vocab = DirectoryReader.open(directory);
        _searcher_vocab = new IndexSearcher(_reader_vocab);

        LOG.info("Computing number of ngram occurrences.");
        File sumfile = new File(index_dir, "__sum_ngrams__");
        try {
            InputStream in = new FileInputStream(sumfile);
            Properties p = new Properties();
            p.load(in);
            in.close();
            int max_n = Math.max(_order, Integer.parseInt(p.getProperty("max_n")));
            if (max_n < order)
                LOG.error("max_n={} in {} is smaller than the order of the language model ({}).", max_n,
                        sumfile, order);
            int max_c = Integer.parseInt(p.getProperty("max_c"));
            _N = new double[max_n + 1][max_c];
            _sum_ngrams = new double[max_n + 1];
            for (String name : p.stringPropertyNames()) {
                if (name.startsWith("n")) {
                    int n = Integer.parseInt(name.substring(1, name.length()));
                    String[] v = p.getProperty(name).split(",");
                    for (int i = 0; i < v.length; i++) {
                        _N[n][i] = Double.parseDouble(v[i]);
                    }
                } else if (name.startsWith("s")) {
                    int n = Integer.parseInt(name.substring(1, name.length()));
                    _sum_ngrams[n] = Double.parseDouble(p.getProperty(name));
                }
            }
        } catch (Exception e) {
            LOG.error("Could not read ngram sum file '{}'.", sumfile, e);
            _N = new double[order + 1][6];
            _sum_ngrams = new double[order + 1];
        }

        _num_ngrams = new double[_N.length][4];
        long sum = 0;
        for (int n = 0; n < _N.length; n++) {
            for (int i = 0; i < 3; i++)
                _num_ngrams[n][i] = _N[n][i];
            for (int i = 3; i < _N[n].length; i++)
                _num_ngrams[n][3] += _N[n][i];
            sum += _num_ngrams[n][0];
        }

        LOG.info("Number of Ngrams {}.", _searcher_ngram.collectionStatistics("ngram").docCount());
        LOG.info("Number of Ngrams {}.", sum);
        LOG.info("Vocabulary Size {}.", _searcher_vocab.collectionStatistics("word").docCount());
    } catch (IOException e) {
        LOG.error("Could not open lucene index: Dir={}; Dir exists={}; ", index_dir,
                index_dir.exists() && index_dir.isDirectory(), e);
    }
}
From source file:de.tudarmstadt.ukp.dkpro.tc.features.ngram.base.LuceneFeatureExtractorBase.java
License:Apache License
@Override
protected FrequencyDistribution<String> getTopNgrams() throws ResourceInitializationException {
    FrequencyDistribution<String> topNGrams = new FrequencyDistribution<String>();
    MinMaxPriorityQueue<TermFreqTuple> topN = MinMaxPriorityQueue.maximumSize(getTopN()).create();
    long ngramVocabularySize = 0;
    IndexReader reader;
    try {
        reader = DirectoryReader.open(FSDirectory.open(luceneDir));
        Fields fields = MultiFields.getFields(reader);
        if (fields != null) {
            Terms terms = fields.terms(getFieldName());
            if (terms != null) {
                TermsEnum termsEnum = terms.iterator(null);
                BytesRef text = null;
                while ((text = termsEnum.next()) != null) {
                    String term = text.utf8ToString();
                    long freq = termsEnum.totalTermFreq();
                    if (passesScreening(term)) {
                        topN.add(new TermFreqTuple(term, freq));
                        ngramVocabularySize += freq;
                    }
                }
            }
        }
    } catch (Exception e) {
        throw new ResourceInitializationException(e);
    }

    int size = topN.size();
    for (int i = 0; i < size; i++) {
        TermFreqTuple tuple = topN.poll();
        long absCount = tuple.getFreq();
        double relFrequency = ((double) absCount) / ngramVocabularySize;
        if (relFrequency >= ngramFreqThreshold) {
            topNGrams.addSample(tuple.getTerm(), tuple.getFreq());
        }
    }
    logSelectionProcess(topNGrams.getB());
    return topNGrams;
}