List of usage examples for org.apache.lucene.index DirectoryReader open
public static DirectoryReader open(final IndexCommit commit) throws IOException
From source file:cc.pp.analyzer.ik.demo.IKAnalyzerDemo.java
License:Apache License
public static void main(String[] args) { //Lucene Document?? String fieldName = "text"; //// ww w . ja v a2 s .c o m String text = "IK Analyzer???????"; //IKAnalyzer? Analyzer analyzer = new IKAnalyzer(Version.LUCENE_48, true); Directory directory = null; IndexWriter iwriter = null; DirectoryReader ireader = null; IndexSearcher isearcher = null; try { // directory = new RAMDirectory(); //?IndexWriterConfig IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_48, analyzer); iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND); iwriter = new IndexWriter(directory, iwConfig); // Document doc = new Document(); doc.add(new LongField("ID", 1000, Field.Store.YES)); doc.add(new TextField(fieldName, text, Field.Store.YES)); iwriter.addDocument(doc); iwriter.close(); //?********************************** //? ireader = DirectoryReader.open(directory); isearcher = new IndexSearcher(ireader); String keyword = "?"; // String keyword = ""; //QueryParser?Query QueryParser qp = new QueryParser(Version.LUCENE_48, fieldName, analyzer); qp.setDefaultOperator(QueryParser.AND_OPERATOR); Query query = qp.parse(keyword); System.out.println("Query = " + query); //?5? TopDocs topDocs = isearcher.search(query, 5); System.out.println("" + topDocs.totalHits); // ScoreDoc[] scoreDocs = topDocs.scoreDocs; for (int i = 0; i < topDocs.totalHits; i++) { Document targetDoc = isearcher.doc(scoreDocs[i].doc); System.out.println("" + targetDoc.toString()); } } catch (CorruptIndexException e) { e.printStackTrace(); } catch (LockObtainFailedException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } catch (ParseException e) { e.printStackTrace(); } finally { if (ireader != null) { try { ireader.close(); } catch (IOException e) { e.printStackTrace(); } } if (directory != null) { try { directory.close(); } catch (IOException e) { e.printStackTrace(); } } } }
From source file:cc.twittertools.index.ExtractTermStatisticsFromIndex.java
License:Apache License
/**
 * Dumps per-term statistics for the {@code StatusField.TEXT} field of an index.
 *
 * <p>For every term whose document frequency is at least the configured
 * minimum, prints {@code term<TAB>docFreq<TAB>totalTermFreq} to standard
 * output (UTF-8). Terms below the minimum are counted, and a summary of how
 * many were skipped (and their summed total term frequency) is written to
 * standard error at the end.
 *
 * @param args command-line arguments; the index option is required, the
 *             minimum document frequency option defaults to 1
 * @throws Exception on I/O failure or a non-numeric minimum
 */
@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("index").create(INDEX_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("min").create(MIN_OPTION));

    CommandLine cmdline = null;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    if (!cmdline.hasOption(INDEX_OPTION)) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.printHelp(ExtractTermStatisticsFromIndex.class.getName(), options);
        System.exit(-1);
    }

    String indexLocation = cmdline.getOptionValue(INDEX_OPTION);
    int min = cmdline.hasOption(MIN_OPTION) ? Integer.parseInt(cmdline.getOptionValue(MIN_OPTION)) : 1;

    PrintStream out = new PrintStream(System.out, true, "UTF-8");

    long missingCnt = 0;
    int skippedTerms = 0;

    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexLocation)));
    try {
        Terms terms = SlowCompositeReaderWrapper.wrap(reader).terms(StatusField.TEXT.name);
        // terms(...) yields null when the field is absent from the index;
        // treat that as "no terms" instead of failing with an NPE.
        if (terms != null) {
            TermsEnum termsEnum = terms.iterator(TermsEnum.EMPTY);
            BytesRef bytes;
            while ((bytes = termsEnum.next()) != null) {
                // A BytesRef is a slice: the term starts at bytes.offset,
                // not necessarily at index 0 of the backing array.
                String term = new String(bytes.bytes, bytes.offset, bytes.length, "UTF-8");
                int df = termsEnum.docFreq();
                long cf = termsEnum.totalTermFreq();

                if (df < min) {
                    skippedTerms++;
                    missingCnt += cf;
                    continue;
                }

                out.println(term + "\t" + df + "\t" + cf);
            }
        }
    } finally {
        // Release the reader even if enumeration fails part-way.
        reader.close();
    }
    out.close();

    System.err.println("skipped terms: " + skippedTerms + ", cnt: " + missingCnt);
}
From source file:cc.twittertools.index.ExtractTweetidsFromIndex.java
License:Apache License
/**
 * Prints one line per document slot of an index: the stored
 * {@code StatusField.ID} value, a tab, and the stored
 * {@code StatusField.SCREEN_NAME} value, written to standard output as UTF-8.
 *
 * @param args command-line arguments; the index location option is required
 * @throws Exception on I/O failure while reading the index
 */
@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION));

    // Parse the arguments; a malformed command line terminates the process.
    CommandLine commandLine = null;
    try {
        commandLine = new GnuParser().parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    // The index location is mandatory: show usage and quit when absent.
    boolean hasIndex = commandLine.hasOption(INDEX_OPTION);
    if (!hasIndex) {
        new HelpFormatter().printHelp(ExtractTweetidsFromIndex.class.getName(), options);
        System.exit(-1);
    }

    File indexDir = new File(commandLine.getOptionValue(INDEX_OPTION));
    if (!indexDir.exists()) {
        System.err.println("Error: " + indexDir + " does not exist!");
        System.exit(-1);
    }

    IndexReader indexReader = DirectoryReader.open(FSDirectory.open(indexDir));
    PrintStream stdout = new PrintStream(System.out, true, "UTF-8");

    // Walk every document number from 0 up to (but excluding) maxDoc().
    int docId = 0;
    while (docId < indexReader.maxDoc()) {
        Document stored = indexReader.document(docId);
        String tweetId = stored.getField(StatusField.ID.name).stringValue();
        String screenName = stored.getField(StatusField.SCREEN_NAME.name).stringValue();
        stdout.println(tweetId + "\t" + screenName);
        docId++;
    }

    stdout.close();
    indexReader.close();
}
From source file:cc.twittertools.search.api.TrecSearchHandler.java
License:Apache License
public TrecSearchHandler(File indexPath, @Nullable Map<String, String> credentials) throws IOException { Preconditions.checkNotNull(indexPath); Preconditions.checkArgument(indexPath.exists()); // Can be null, in which case we don't check for credentials. this.credentials = credentials; IndexReader reader = DirectoryReader.open(FSDirectory.open(indexPath)); searcher = new IndexSearcher(reader); searcher.setSimilarity(new LMDirichletSimilarity(2500.0f)); }
From source file:cc.twittertools.search.api.TrecSearchThriftServer.java
License:Apache License
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(new Option(HELP_OPTION, "show help")); options.addOption(OptionBuilder.withArgName("port").hasArg().withDescription("port").create(PORT_OPTION)); options.addOption(//from ww w . jav a 2s.co m OptionBuilder.withArgName("index").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg() .withDescription("max number of threads in thread pool").create(MAX_THREADS_OPTION)); options.addOption(OptionBuilder.withArgName("file").hasArg() .withDescription("file containing access tokens").create(CREDENTIALS_OPTION)); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(TrecSearchThriftServer.class.getName(), options); System.exit(-1); } int port = cmdline.hasOption(PORT_OPTION) ? Integer.parseInt(cmdline.getOptionValue(PORT_OPTION)) : DEFAULT_PORT; int maxThreads = cmdline.hasOption(MAX_THREADS_OPTION) ? 
Integer.parseInt(cmdline.getOptionValue(MAX_THREADS_OPTION)) : DEFAULT_MAX_THREADS; File index = new File(cmdline.getOptionValue(INDEX_OPTION)); Map<String, String> credentials = null; if (cmdline.hasOption(CREDENTIALS_OPTION)) { credentials = Maps.newHashMap(); File cfile = new File(cmdline.getOptionValue(CREDENTIALS_OPTION)); if (!cfile.exists()) { System.err.println("Error: " + cfile + " does not exist!"); System.exit(-1); } for (String s : Files.readLines(cfile, Charsets.UTF_8)) { try { String[] arr = s.split(":"); credentials.put(arr[0], arr[1]); } catch (Exception e) { // Catch any exceptions from parsing file contain access tokens System.err.println("Error reading access tokens from " + cfile + "!"); System.exit(-1); } } } if (!index.exists()) { System.err.println("Error: " + index + " does not exist!"); System.exit(-1); } IndexReader reader = DirectoryReader.open(MMapDirectory.open(index)); IndexSearcher searcher = new IndexSearcher(reader); searcher.setSimilarity(new LMDirichletSimilarity(DEFAULT_MU)); QueryLikelihoodModel qlModel = new QueryLikelihoodModel(DEFAULT_MU); TServerSocket serverSocket = new TServerSocket(port); TrecSearch.Processor<TrecSearch.Iface> searchProcessor = new TrecSearch.Processor<TrecSearch.Iface>( new TrecSearchHandler(searcher, qlModel, credentials)); TThreadPoolServer.Args serverArgs = new TThreadPoolServer.Args(serverSocket); serverArgs.maxWorkerThreads(maxThreads); TServer thriftServer = new TThreadPoolServer( serverArgs.processor(searchProcessor).protocolFactory(new TBinaryProtocol.Factory())); thriftServer.serve(); }
From source file:cc.twittertools.search.local.RunQueries.java
License:Apache License
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(//from w w w . j a v a 2 s . co m OptionBuilder.withArgName("path").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of results to return") .create(NUM_RESULTS_OPTION)); options.addOption(OptionBuilder.withArgName("file").hasArg() .withDescription("file containing topics in TREC format").create(QUERIES_OPTION)); options.addOption(OptionBuilder.withArgName("similarity").hasArg() .withDescription("similarity to use (BM25, LM)").create(SIMILARITY_OPTION)); options.addOption( OptionBuilder.withArgName("string").hasArg().withDescription("runtag").create(RUNTAG_OPTION)); options.addOption(new Option(VERBOSE_OPTION, "print out complete document")); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(QUERIES_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(RunQueries.class.getName(), options); System.exit(-1); } File indexLocation = new File(cmdline.getOptionValue(INDEX_OPTION)); if (!indexLocation.exists()) { System.err.println("Error: " + indexLocation + " does not exist!"); System.exit(-1); } String runtag = cmdline.hasOption(RUNTAG_OPTION) ? 
cmdline.getOptionValue(RUNTAG_OPTION) : DEFAULT_RUNTAG; String topicsFile = cmdline.getOptionValue(QUERIES_OPTION); int numResults = 1000; try { if (cmdline.hasOption(NUM_RESULTS_OPTION)) { numResults = Integer.parseInt(cmdline.getOptionValue(NUM_RESULTS_OPTION)); } } catch (NumberFormatException e) { System.err.println("Invalid " + NUM_RESULTS_OPTION + ": " + cmdline.getOptionValue(NUM_RESULTS_OPTION)); System.exit(-1); } String similarity = "LM"; if (cmdline.hasOption(SIMILARITY_OPTION)) { similarity = cmdline.getOptionValue(SIMILARITY_OPTION); } boolean verbose = cmdline.hasOption(VERBOSE_OPTION); PrintStream out = new PrintStream(System.out, true, "UTF-8"); IndexReader reader = DirectoryReader.open(FSDirectory.open(indexLocation)); IndexSearcher searcher = new IndexSearcher(reader); if (similarity.equalsIgnoreCase("BM25")) { searcher.setSimilarity(new BM25Similarity()); } else if (similarity.equalsIgnoreCase("LM")) { searcher.setSimilarity(new LMDirichletSimilarity(2500.0f)); } QueryParser p = new QueryParser(Version.LUCENE_43, StatusField.TEXT.name, IndexStatuses.ANALYZER); TrecTopicSet topics = TrecTopicSet.fromFile(new File(topicsFile)); for (TrecTopic topic : topics) { Query query = p.parse(topic.getQuery()); Filter filter = NumericRangeFilter.newLongRange(StatusField.ID.name, 0L, topic.getQueryTweetTime(), true, true); TopDocs rs = searcher.search(query, filter, numResults); int i = 1; for (ScoreDoc scoreDoc : rs.scoreDocs) { Document hit = searcher.doc(scoreDoc.doc); out.println(String.format("%s Q0 %s %d %f %s", topic.getId(), hit.getField(StatusField.ID.name).numericValue(), i, scoreDoc.score, runtag)); if (verbose) { out.println("# " + hit.toString().replaceAll("[\\n\\r]+", " ")); } i++; } } reader.close(); out.close(); }
From source file:cc.twittertools.search.local.SearchStatuses.java
License:Apache License
/**
 * Runs a single query against a Lucene index and prints the ranked results,
 * one line per hit, in the form
 * {@code <query id> Q0 <doc id> <rank> <score> <runtag>}.
 *
 * <p>Results are restricted by a numeric range filter on
 * {@code StatusField.ID} from 0 up to the configured maximum id (inclusive).
 * With the verbose option the full stored document follows each result line,
 * prefixed with {@code "# "}.
 *
 * @param args command-line arguments; the query and index options are required
 * @throws Exception on I/O or query-parsing failure, or on a non-numeric
 *                   max id / result count
 */
@SuppressWarnings("static-access")
public static void main(String[] args) throws Exception {
    Options options = new Options();
    options.addOption(
            OptionBuilder.withArgName("path").hasArg().withDescription("index location").create(INDEX_OPTION));
    options.addOption(
            OptionBuilder.withArgName("string").hasArg().withDescription("query id").create(QID_OPTION));
    options.addOption(
            OptionBuilder.withArgName("string").hasArg().withDescription("query text").create(QUERY_OPTION));
    options.addOption(
            OptionBuilder.withArgName("string").hasArg().withDescription("runtag").create(RUNTAG_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("maxid").create(MAX_ID_OPTION));
    options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of results to return")
            .create(NUM_RESULTS_OPTION));
    options.addOption(OptionBuilder.withArgName("similarity").hasArg()
            .withDescription("similarity to use (BM25, LM)").create(SIMILARITY_OPTION));
    options.addOption(new Option(VERBOSE_OPTION, "print out complete document"));

    CommandLine commandLine = null;
    try {
        commandLine = new GnuParser().parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        System.exit(-1);
    }

    // Both the query text and the index location are mandatory.
    boolean requiredPresent = commandLine.hasOption(QUERY_OPTION) && commandLine.hasOption(INDEX_OPTION);
    if (!requiredPresent) {
        new HelpFormatter().printHelp(SearchStatuses.class.getName(), options);
        System.exit(-1);
    }

    File indexLocation = new File(commandLine.getOptionValue(INDEX_OPTION));
    if (!indexLocation.exists()) {
        System.err.println("Error: " + indexLocation + " does not exist!");
        System.exit(-1);
    }

    // Optional settings: start from the defaults and override when supplied.
    String qid = DEFAULT_QID;
    if (commandLine.hasOption(QID_OPTION)) {
        qid = commandLine.getOptionValue(QID_OPTION);
    }

    String queryText = DEFAULT_Q;
    if (commandLine.hasOption(QUERY_OPTION)) {
        queryText = commandLine.getOptionValue(QUERY_OPTION);
    }

    String runtag = DEFAULT_RUNTAG;
    if (commandLine.hasOption(RUNTAG_OPTION)) {
        runtag = commandLine.getOptionValue(RUNTAG_OPTION);
    }

    long maxId = DEFAULT_MAX_ID;
    if (commandLine.hasOption(MAX_ID_OPTION)) {
        maxId = Long.parseLong(commandLine.getOptionValue(MAX_ID_OPTION));
    }

    int numResults = DEFAULT_NUM_RESULTS;
    if (commandLine.hasOption(NUM_RESULTS_OPTION)) {
        numResults = Integer.parseInt(commandLine.getOptionValue(NUM_RESULTS_OPTION));
    }

    boolean verbose = commandLine.hasOption(VERBOSE_OPTION);

    String similarity = commandLine.hasOption(SIMILARITY_OPTION)
            ? commandLine.getOptionValue(SIMILARITY_OPTION)
            : "LM";

    PrintStream out = new PrintStream(System.out, true, "UTF-8");

    IndexReader reader = DirectoryReader.open(FSDirectory.open(indexLocation));
    IndexSearcher searcher = new IndexSearcher(reader);

    // Any value other than BM25 / LM leaves the searcher's similarity untouched.
    if (similarity.equalsIgnoreCase("BM25")) {
        searcher.setSimilarity(new BM25Similarity());
    } else if (similarity.equalsIgnoreCase("LM")) {
        searcher.setSimilarity(new LMDirichletSimilarity(2500.0f));
    }

    QueryParser queryParser = new QueryParser(Version.LUCENE_43, IndexStatuses.StatusField.TEXT.name,
            IndexStatuses.ANALYZER);
    Query query = queryParser.parse(queryText);
    Filter filter = NumericRangeFilter.newLongRange(StatusField.ID.name, 0L, maxId, true, true);

    ScoreDoc[] ranked = searcher.search(query, filter, numResults).scoreDocs;

    // Ranks are 1-based in the output.
    for (int idx = 0; idx < ranked.length; idx++) {
        ScoreDoc scoreDoc = ranked[idx];
        Document hit = searcher.doc(scoreDoc.doc);
        out.println(String.format("%s Q0 %s %d %f %s", qid, hit.getField(StatusField.ID.name).numericValue(),
                idx + 1, scoreDoc.score, runtag));
        if (verbose) {
            out.println("# " + hit.toString().replaceAll("[\\n\\r]+", " "));
        }
    }

    reader.close();
    out.close();
}
From source file:cc.wikitools.lucene.hadoop.HdfsWikipediaSearcher.java
License:Apache License
/**
 * Opens a reader over an index reached through the file system obtained from
 * {@code conf}, then runs {@code init()}.
 *
 * @param indexLocation path of the index within that file system
 * @param conf          configuration used both to obtain the file system and
 *                      to construct the directory
 * @throws IOException if the file system or the index cannot be opened
 */
public HdfsWikipediaSearcher(Path indexLocation, Configuration conf) throws IOException {
    // The 'false' argument is passed through to FileSystemDirectory unchanged.
    reader = DirectoryReader.open(new FileSystemDirectory(FileSystem.get(conf), indexLocation, false, conf));
    init();
}
From source file:cc.wikitools.lucene.WikipediaSearcher.java
License:Apache License
public WikipediaSearcher(File indexLocation) throws IOException { Preconditions.checkNotNull(indexLocation); Preconditions.checkArgument(indexLocation.exists()); reader = DirectoryReader.open(FSDirectory.open(indexLocation)); init();// w ww. jav a 2 s . co m }
From source file:ch.admin.isb.hermes5.business.search.SearchEngine.java
License:Apache License
public List<SearchResult> search(String searchInput, String modelIdentifier, String lang) { DirectoryReader directoryReader = null; try {//from w w w . j a va 2 s.co m List<SearchResult> results = new ArrayList<SearchResult>(); String queryString = buildQueryString(searchInput); if (queryString != null) { String searchIndexPath = searchIndexManager.getSearchIndexPath(modelIdentifier, lang); try { directoryReader = DirectoryReader.open(FSDirectory.open(new File(searchIndexPath))); } catch (Exception e) { logger.warn("No index files found at " + searchIndexPath + ". Will try to restore from S3"); searchIndexManager.restoreIndexFilesFromS3(modelIdentifier, lang); directoryReader = DirectoryReader.open(FSDirectory.open(new File(searchIndexPath))); } Analyzer analyzer = analyserRepository.getAnalyzer(lang); IndexSearcher isearcher = new IndexSearcher(directoryReader); Query query = new QueryParser(Version.LUCENE_47, "presentationName", analyzer).parse(queryString); ScoreDoc[] hits = isearcher.search(query, null, 1000).scoreDocs; HighlighterWrapper highlighter = highlighterRepository.getHighlighter(analyzer, isearcher, query); for (int i = 0; i < hits.length; i++) { results.add(buildSearchResult(isearcher, highlighter, hits[i].doc)); } } searchLogger.info("<{}> returned {} results", searchInput, results.size()); return results; } catch (Exception e) { logger.warn("An exception occurred during search, empty result will be returned", e); return new ArrayList<SearchResult>(); } finally { try { if (directoryReader != null) { directoryReader.close(); } } catch (IOException e) { logger.debug("unable to close directory reader", e); } } }