List of usage examples for org.apache.lucene.store FSDirectory open
public static FSDirectory open(Path path) throws IOException
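FSDirectory.open tries to pick the best concrete FSDirectory implementation for the current environment (for example MMapDirectory on 64-bit platforms). A minimal standalone sketch before the collected examples; the class name and index path are illustrative:

import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class OpenExample {
    public static void main(String[] args) throws IOException {
        // open() selects the best FSDirectory subclass for this platform;
        // the path is an illustrative placeholder.
        try (Directory dir = FSDirectory.open(Paths.get("/tmp/myindex"))) {
            System.out.println("Opened " + dir.getClass().getSimpleName());
        }
    }
}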
From source file:SearchFiles.java
License:Apache License
/** Simple command-line based search demo. */
public static void main(String[] args) throws Exception {
    String usage = "Usage:\tjava org.apache.lucene.demo.SearchFiles"
            + " [-index dir] [-field f] [-repeat n] [-queries file] [-query string] [-raw] [-paging hitsPerPage]"
            + "\n\nSee http://lucene.apache.org/core/4_1_0/demo/ for details.";
    if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) {
        System.out.println(usage);
        System.exit(0);
    }

    String index = "index";
    String field = "contents";
    String queries = null;
    int repeat = 0;
    boolean raw = false;
    String queryString = null;
    int hitsPerPage = 10;

    for (int i = 0; i < args.length; i++) {
        if ("-index".equals(args[i])) {
            index = args[i + 1];
            i++;
        } else if ("-field".equals(args[i])) {
            field = args[i + 1];
            i++;
        } else if ("-queries".equals(args[i])) {
            queries = args[i + 1];
            i++;
        } else if ("-query".equals(args[i])) {
            queryString = args[i + 1];
            i++;
        } else if ("-repeat".equals(args[i])) {
            repeat = Integer.parseInt(args[i + 1]);
            i++;
        } else if ("-raw".equals(args[i])) {
            raw = true;
        } else if ("-paging".equals(args[i])) {
            hitsPerPage = Integer.parseInt(args[i + 1]);
            if (hitsPerPage <= 0) {
                System.err.println("There must be at least 1 hit per page.");
                System.exit(1);
            }
            i++;
        }
    }

    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index)));
    IndexSearcher searcher = new IndexSearcher(reader);
    Analyzer analyzer = new StandardAnalyzer();

    BufferedReader in = null;
    if (queries != null) {
        in = Files.newBufferedReader(Paths.get(queries), StandardCharsets.UTF_8);
    } else {
        in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
    }
    QueryParser parser = new QueryParser(field, analyzer);
    while (true) {
        if (queries == null && queryString == null) {
            // prompt the user
            System.out.println("Enter query: ");
        }

        String line = queryString != null ? queryString : in.readLine();
        if (line == null) {
            break;
        }

        line = line.trim();
        if (line.length() == 0) {
            break;
        }

        Query query = parser.parse(line);
        System.out.println("Searching for: " + query.toString(field));

        if (repeat > 0) {
            // repeat & time as benchmark
            Date start = new Date();
            for (int i = 0; i < repeat; i++) {
                searcher.search(query, 100);
            }
            Date end = new Date();
            System.out.println("Time: " + (end.getTime() - start.getTime()) + "ms");
        }

        doPagingSearch(in, searcher, query, hitsPerPage, raw, queries == null && queryString == null);

        if (queryString != null) {
            break;
        }
    }
    reader.close();
}
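The doPagingSearch helper the demo calls is not shown in this excerpt. A minimal hypothetical version that prints a single page of results, rather than paginating interactively like the full Lucene demo, might look like this:

// Hypothetical sketch of the doPagingSearch helper referenced above;
// the original interactive, paginating implementation is not on this page.
static void doPagingSearch(BufferedReader in, IndexSearcher searcher, Query query,
        int hitsPerPage, boolean raw, boolean interactive) throws IOException {
    TopDocs results = searcher.search(query, hitsPerPage);
    System.out.println(results.totalHits + " total matching documents");
    for (ScoreDoc hit : results.scoreDocs) {
        Document doc = searcher.doc(hit.doc);
        String path = doc.get("path");  // assumes the index stores a "path" field
        System.out.println((raw ? "doc=" + hit.doc + " score=" + hit.score + " " : "")
                + (path != null ? path : "(no path field)"));
    }
}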
From source file:VocabDumper.java
void printWords() throws IOException {
    IndexReader indexReader = DirectoryReader.open(FSDirectory.open(indexDir));
    Fields fields = MultiFields.getFields(indexReader);
    String[] fieldNames = { MSIRDoc.FIELD_TITLE_EN, MSIRDoc.FIELD_EN,
            MSIRDoc.FIELD_TITLE_HN, MSIRDoc.FIELD_HN };
    for (String fieldName : fieldNames) {
        Terms terms = fields.terms(fieldName);
        TermsEnum iterator = terms.iterator(null);
        BytesRef byteRef = null;
        while ((byteRef = iterator.next()) != null) {
            // Decode the term bytes as UTF-8 rather than with the platform
            // default charset, which new String(bytes, offset, length) would use.
            String term = byteRef.utf8ToString();
            if (term.indexOf('#') == -1)
                System.out.println(term);
        }
    }
}
From source file:WriteIndex.java
License:Apache License
/**
 * @param args
 */
public static void main(String[] args) throws IOException {
    File docs = new File("documents");
    File indexDir = new File(INDEX_DIRECTORY);

    Directory directory = FSDirectory.open(indexDir);
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_35);
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_35, analyzer);
    IndexWriter writer = new IndexWriter(directory, conf);
    writer.deleteAll();

    for (File file : docs.listFiles()) {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        ParseContext context = new ParseContext();
        Parser parser = new AutoDetectParser();
        InputStream stream = new FileInputStream(file);
        try {
            parser.parse(stream, handler, metadata, context);
        } catch (TikaException e) {
            e.printStackTrace();
        } catch (SAXException e) {
            e.printStackTrace();
        } finally {
            stream.close();
        }

        String text = handler.toString();
        String fileName = file.getName();

        Document doc = new Document();
        doc.add(new Field("file", fileName, Store.YES, Index.NO));
        for (String key : metadata.names()) {
            String name = key.toLowerCase();
            String value = metadata.get(key);
            if (StringUtils.isBlank(value)) {
                continue;
            }
            if ("keywords".equalsIgnoreCase(key)) {
                for (String keyword : value.split(",?(\\s+)")) {
                    doc.add(new Field(name, keyword, Store.YES, Index.NOT_ANALYZED));
                }
            } else if ("title".equalsIgnoreCase(key)) {
                doc.add(new Field(name, value, Store.YES, Index.ANALYZED));
            } else {
                // Store the metadata value itself, unanalyzed.
                doc.add(new Field(name, value, Store.YES, Index.NOT_ANALYZED));
            }
        }
        doc.add(new Field("text", text, Store.NO, Index.ANALYZED));
        writer.addDocument(doc);
    }

    writer.commit();
    writer.deleteUnusedFiles();
    System.out.println(writer.maxDoc() + " documents written");
    writer.close();
}
From source file:ContentBasedAnalysis.java
License:Apache License
/** Simple command-line based content-similarity demo. */
public static void main(String[] args) throws Exception {
    String usage = "Usage:\tjava ContentBasedAnalysis [-index dir]";
    if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) {
        System.out.println(usage);
        System.exit(0);
    }

    String index = "index";
    String field = "contents";
    String queries = null;

    for (int i = 0; i < args.length; i++) {
        if ("-index".equals(args[i])) {
            index = args[i + 1];
            i++;
        } else if ("-field".equals(args[i])) {
            field = args[i + 1];
            i++;
        }
    }

    // Create a reader and a searcher for the index
    IndexReader reader = IndexReader.open(FSDirectory.open(new File(index)));
    IndexSearcher searcher = new IndexSearcher(reader);

    // Reader for the file names typed by the user
    BufferedReader in = null;
    if (queries != null) {
        in = new BufferedReader(new InputStreamReader(new FileInputStream(queries), "UTF-8"));
    } else {
        in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
    }

    while (true) {
        System.out.println("Enter filename 1 (or hit <RETURN>): ");
        String f1 = in.readLine();
        if (f1 == null)
            break;
        f1 = f1.trim();
        if (f1.length() == 0)
            break;

        System.out.println("Enter filename 2: ");
        String f2 = in.readLine();

        int id1 = findDocId(searcher, f1);
        if (id1 < 0) {
            System.out.println("No file " + f1 + " found in index!");
            break;
        }
        int id2 = findDocId(searcher, f2);
        if (id2 < 0) {
            System.out.println("No file " + f2 + " found in index!");
            break;
        }

        // Conversion to TF-IDF format
        TermWeight[] v1 = toTfIdf(reader, id1);
        TermWeight[] v2 = toTfIdf(reader, id2);
        System.out.println("The cosine similarity of the two files is: " + cosineSimilarity(v1, v2));
    }
    searcher.close();
    reader.close();
}
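The helpers findDocId, toTfIdf, and cosineSimilarity, and the TermWeight class, are not shown in this excerpt. A minimal sketch of the cosine computation, assuming a hypothetical TermWeight that pairs a term with its tf-idf weight and that vectors are sorted by term:

// Hypothetical helpers; the originals are not on this page. Assumes
// TermWeight arrays are sorted by term so the two vectors merge in one pass.
static class TermWeight {
    final String term;
    final double weight;
    TermWeight(String term, double weight) { this.term = term; this.weight = weight; }
}

static double cosineSimilarity(TermWeight[] v1, TermWeight[] v2) {
    double dot = 0, norm1 = 0, norm2 = 0;
    int i = 0, j = 0;
    // Accumulate the dot product over terms present in both vectors.
    while (i < v1.length && j < v2.length) {
        int cmp = v1[i].term.compareTo(v2[j].term);
        if (cmp == 0) {
            dot += v1[i].weight * v2[j].weight;
            i++;
            j++;
        } else if (cmp < 0) {
            i++;
        } else {
            j++;
        }
    }
    for (TermWeight tw : v1) norm1 += tw.weight * tw.weight;
    for (TermWeight tw : v2) norm2 += tw.weight * tw.weight;
    return (norm1 == 0 || norm2 == 0) ? 0 : dot / (Math.sqrt(norm1) * Math.sqrt(norm2));
}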
From source file:SearcherTest.java
@Before
public void setUp() throws Exception {
    dir = FSDirectory.open(Paths.get("E:\\lucene\\idx"));
    reader = DirectoryReader.open(dir);
    is = new IndexSearcher(reader);
}
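The setUp above opens a directory and a reader, but the excerpt shows no cleanup. A plausible companion tearDown (an assumption, not part of the original test):

@After
public void tearDown() throws Exception {
    // Close in reverse order of acquisition: reader first, then directory.
    reader.close();
    dir.close();
}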
From source file:MakeLuceneIndex.java
License:Apache License
/** Index all text files under a directory.
 * @throws UnsupportedEncodingException
 * @throws FileNotFoundException */
public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
    String baseDir = "/home/chrisschaefer/";
    //String wikiDumpFile = "Downloads/enwiki-20130604-pages-articles.xml.bz2";
    String wikiDumpFile = "enwiki-20130604-pages-articlese.xml.bz2";
    String luceneIndexName = "enwiki-20130604-lucene2";
    boolean bIgnoreStubs = false;

    for (int i = 0; i < args.length; ++i) {
        if (args[i].equals("-luceneindex"))
            luceneIndexName = args[++i];
        if (args[i].equals("-basedir"))
            baseDir = args[++i];
        if (args[i].equals("-dumpfile"))
            wikiDumpFile = args[++i];
        if (args[i].equals("-includestubs"))
            bIgnoreStubs = true;
    }

    String rawTextPath = baseDir + luceneIndexName + "-raw-text.txt";
    String logPath = baseDir + luceneIndexName + ".log";
    PrintWriter artikelTextWriter = new PrintWriter(rawTextPath, "UTF-8");
    PrintWriter logger = new PrintWriter(logPath, "UTF-8");
    logger.println("Indexing to directory '" + baseDir + luceneIndexName + "'");
    System.out.println("Indexing to directory '" + baseDir + luceneIndexName + "'");

    Date start = new Date();

    try {
        Directory dir = FSDirectory.open(new File(baseDir + luceneIndexName));
        Analyzer analyzer = new WikipediaAnalyzer();
        // Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer);

        // Create a new index in the directory, removing any
        // previously indexed documents:
        iwc.setOpenMode(OpenMode.CREATE);
        iwc.setSimilarity(new ESASimilarity());

        // Optional: for better indexing performance, if you
        // are indexing many documents, increase the RAM
        // buffer. But if you do this, increase the max heap
        // size to the JVM (eg add -Xmx512m or -Xmx1g):
        //
        // iwc.setRAMBufferSizeMB(2000.0);

        IndexWriter writer = new IndexWriter(dir, iwc);
        Extractor wikidumpExtractor = new Extractor(baseDir + File.separator + wikiDumpFile);
        wikidumpExtractor.setLinkSeparator("_");
        wikidumpExtractor.setCategorySeparator("_");
        wikidumpExtractor.setTitleSeparator(" ");

        int iStubs = 0;
        int iArticleCount = 0;
        int iSkippedPageCount = 0;

        // Phase 1: dump article text and count inlinks.
        while (wikidumpExtractor.nextPage()) {
            if (wikidumpExtractor.getPageType() != Extractor.PageType.ARTICLE) {
                ++iSkippedPageCount;
                continue;
            }
            if (bIgnoreStubs && wikidumpExtractor.getStub()) {
                ++iStubs;
                continue;
            }
            // skip pages with less than 5 out links
            if (wikidumpExtractor.getPageLinkList(true).size() < 5) {
                ++iSkippedPageCount;
                continue;
            }
            if (wikidumpExtractor.getPageCategories().equals("")) {
                ++iSkippedPageCount;
                logger.println("skipped because of stop category: " + wikidumpExtractor.getPageTitle(false));
                continue;
            } else {
                for (String link : wikidumpExtractor.getPageLinkList(false)) {
                    if (_inLinks.containsKey(link)) {
                        int tmp = _inLinks.get(link);
                        tmp++;
                        _inLinks.put(link, tmp);
                    } else {
                        _inLinks.put(link, 1);
                    }
                }
            }
            if (wikidumpExtractor.getPageText().equals("")) {
                ++iSkippedPageCount;
                continue;
            }
            artikelTextWriter.println(
                    wikidumpExtractor.getPageTitle(false) + "\t" + wikidumpExtractor.getPageText(false));
            ++iArticleCount;
            if (iArticleCount % 1000 == 0) {
                logger.println(new Date().toString() + " phase 1 -- iArticleCount: " + iArticleCount
                        + " iSkippedPageCount: " + iSkippedPageCount);
            }
        }
        artikelTextWriter.close();

        // Phase 2: index articles that have more than four inlinks.
        iArticleCount = 0;
        PrintWriter artikelInLinkWriter = new PrintWriter(baseDir + luceneIndexName + "-inlinks.txt", "UTF-8");
        BufferedReader br = new BufferedReader(new FileReader(rawTextPath));
        String line = br.readLine();
        while (line != null) {
            int endOfTitle = line.indexOf("\t");
            String title = line.substring(0, endOfTitle);
            if (_inLinks.containsKey(title)) {
                int inlinks = _inLinks.get(title);
                artikelInLinkWriter.println(title + "\t" + inlinks);
                if (inlinks > 4) {
                    Document doc = new Document();
                    ++iArticleCount;
                    // Boost the title by repeating it four times ahead of the body text.
                    doc.add(new TextField("contents", title + " " + title + " " + title + " " + title + " "
                            + line.substring(endOfTitle + 1), Field.Store.NO));
                    writer.addDocument(doc);
                    if (iArticleCount % 1000 == 0) {
                        writer.commit();
                        logger.println(new Date().toString() + " phase 2 -- iArticleCount: " + iArticleCount
                                + " iSkippedPageCount: " + iSkippedPageCount);
                    }
                }
            } else {
                artikelInLinkWriter.println(title + "\t0");
            }
            line = br.readLine();
        }
        br.close();
        artikelInLinkWriter.close();

        // NOTE: if you want to maximize search performance,
        // you can optionally call forceMerge here. This can be
        // a terribly costly operation, so generally it's only
        // worth it when your index is relatively static (ie
        // you're done adding documents to it):
        //
        // writer.commit();
        writer.forceMerge(1);
        writer.close();

        Date end = new Date();
        String endStatement = end.getTime() - start.getTime() + " total milliseconds ("
                + (end.getTime() - start.getTime()) / 3600000.0 + " hours), " + iArticleCount + " Articles.";
        logger.println(endStatement);
        System.out.println(endStatement);
        logger.close();
    } catch (Exception e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}
From source file:MakeLuceneIndexPreprocessed.java
License:Apache License
/** Index all text files under a directory.
 * @throws UnsupportedEncodingException
 * @throws FileNotFoundException */
public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
    String baseDir = "/home/chrisschaefer/";
    String inputLuceneIndexName = "2013-06-18-lucene-gab";
    String luceneIndexName = "2013-06-18-lucene-gab-standard";

    for (int i = 0; i < args.length; ++i) {
        if (args[i].equals("-inputluceneindex"))
            inputLuceneIndexName = args[++i];
        if (args[i].equals("-outputluceneindex"))
            luceneIndexName = args[++i];
        if (args[i].equals("-basedir"))
            baseDir = args[++i];
    }

    String rawTextPath = baseDir + inputLuceneIndexName + "-raw-text.txt";
    String artikelInLinksPath = baseDir + inputLuceneIndexName + "-inlinks.txt";
    String logPath = baseDir + inputLuceneIndexName + ".log";
    PrintWriter logger = new PrintWriter(logPath, "UTF-8");
    logger.println("Indexing to directory '" + baseDir + luceneIndexName + "'");
    System.out.println("Indexing to directory '" + baseDir + luceneIndexName + "'");

    Date start = new Date();
    logger.println(start.toString() + " iArticleCount: 0 iSkippedPageCount: 0");

    try {
        Directory dir = FSDirectory.open(new File(baseDir + luceneIndexName));
        // Analyzer analyzer = new WikipediaAnalyzer();
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer);

        // Create a new index in the directory, removing any
        // previously indexed documents:
        iwc.setOpenMode(OpenMode.CREATE);

        // Optional: for better indexing performance, if you
        // are indexing many documents, increase the RAM
        // buffer. But if you do this, increase the max heap
        // size to the JVM (eg add -Xmx512m or -Xmx1g):
        //
        // iwc.setRAMBufferSizeMB(2000.0);
        // iwc.setSimilarity(new ESASimilarity());

        IndexWriter writer = new IndexWriter(dir, iwc);
        int iArticleCount = 0;
        int iSkippedPageCount = 0;

        BufferedReader rawTextReader = new BufferedReader(new FileReader(rawTextPath));
        BufferedReader artikelInLinksReader = new BufferedReader(new FileReader(artikelInLinksPath));
        String lineText = rawTextReader.readLine();
        String lineLinks = artikelInLinksReader.readLine();
        while (lineText != null) {
            int endOfTitle = lineText.indexOf("\t");
            String title = lineText.substring(0, endOfTitle);
            // Only index articles that have at least one inlink.
            if (Integer.valueOf(lineLinks.substring(lineLinks.indexOf("\t") + 1)) > 0) {
                ++iArticleCount;
                Document doc = new Document();
                // Boost the title by repeating it four times ahead of the body text.
                doc.add(new TextField("contents", title + " " + title + " " + title + " " + title + " "
                        + lineText.substring(endOfTitle + 1), Field.Store.NO));
                writer.addDocument(doc);
                if (iArticleCount % 1000 == 0) {
                    writer.commit();
                    logger.println(new Date().toString() + " phase 2 -- iArticleCount: " + iArticleCount
                            + " iSkippedPageCount: " + iSkippedPageCount);
                    logger.flush();
                }
            }
            lineText = rawTextReader.readLine();
            lineLinks = artikelInLinksReader.readLine();
        }
        rawTextReader.close();
        artikelInLinksReader.close();

        // NOTE: if you want to maximize search performance,
        // you can optionally call forceMerge here. This can be
        // a terribly costly operation, so generally it's only
        // worth it when your index is relatively static (ie
        // you're done adding documents to it):
        //
        // writer.commit();
        writer.forceMerge(1);
        writer.close();

        Date end = new Date();
        String endStatement = end.getTime() - start.getTime() + " total milliseconds ("
                + (end.getTime() - start.getTime()) / 3600000.0 + " hours), " + iArticleCount + " Articles.";
        logger.println(endStatement);
        System.out.println(endStatement);
        logger.close();
    } catch (Exception e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}
From source file:FileIndexer.java
License:Apache License
public static void main(String[] args) {
    String usage = "java FileIndexer"
            + " [-index INDEX_PATH] [-docs DOCS_PATH] [-excludes FILE] [-update]\n\n"
            + "This indexes the documents in DOCS_PATH, creating a Lucene index "
            + "in INDEX_PATH that can be searched with SearchFiles.\n"
            + "excludes is an optional list of files to be excluded, one per line.";
    String indexPath = "index";
    String docsPath = null;
    boolean create = true;
    List<String> excludes = new ArrayList<String>();

    for (int i = 0; i < args.length; i++) {
        if ("-index".equals(args[i])) {
            indexPath = args[i + 1];
            i++;
        } else if ("-docs".equals(args[i])) {
            docsPath = args[i + 1];
            i++;
        } else if ("-excludes".equals(args[i])) {
            Scanner sc = null;
            try {
                sc = new Scanner(new File(args[i + 1]));
                i++;
            } catch (FileNotFoundException fnfe) {
                System.err.println(fnfe.getMessage());
                System.exit(1);
            }
            while (sc.hasNext()) {
                excludes.add(sc.next());
            }
            sc.close();
        } else if ("-update".equals(args[i])) {
            create = false;
        }
    }

    if (docsPath == null) {
        System.err.println("Usage: " + usage);
        System.exit(1);
    }

    final Path docDir = Paths.get(docsPath);
    if (!Files.isReadable(docDir)) {
        System.out.println("Document directory '" + docDir.toAbsolutePath()
                + "' does not exist or is not readable, please check the path");
        System.exit(1);
    }

    Date start = new Date();
    try {
        System.out.println("Indexing to directory '" + indexPath + "'...");

        Directory dir = FSDirectory.open(Paths.get(indexPath));
        Analyzer analyzer = new LimitTokenCountAnalyzer(new StandardAnalyzer(), 1000000);
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

        if (create) {
            // Create a new index in the directory, removing any
            // previously indexed documents:
            iwc.setOpenMode(OpenMode.CREATE);
        } else {
            // Add new documents to an existing index:
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        }

        // Optional: for better indexing performance, if you
        // are indexing many documents, increase the RAM
        // buffer. But if you do this, increase the max heap
        // size to the JVM (eg add -Xmx512m or -Xmx1g):
        //
        // iwc.setRAMBufferSizeMB(256.0);

        IndexWriter writer = new IndexWriter(dir, iwc);
        indexDocs(writer, docDir, excludes);

        // NOTE: if you want to maximize search performance,
        // you can optionally call forceMerge here. This can be
        // a terribly costly operation, so generally it's only
        // worth it when your index is relatively static (ie
        // you're done adding documents to it):
        //
        // writer.forceMerge(1);

        writer.close();

        Date end = new Date();
        System.out.println(end.getTime() - start.getTime() + " total milliseconds");
    } catch (IOException e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}
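The indexDocs helper is not included in this excerpt. A minimal sketch, assuming the excludes list holds plain file names to skip and that documents are indexed as UTF-8 text (this is an assumption, not the original implementation):

// Hypothetical sketch of the indexDocs helper referenced above.
static void indexDocs(final IndexWriter writer, Path path, final List<String> excludes) throws IOException {
    Files.walkFileTree(path, new SimpleFileVisitor<Path>() {
        @Override
        public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
            if (!excludes.contains(file.getFileName().toString())) {
                Document doc = new Document();
                doc.add(new StringField("path", file.toString(), Field.Store.YES));
                doc.add(new TextField("contents",
                        new String(Files.readAllBytes(file), StandardCharsets.UTF_8), Field.Store.NO));
                // updateDocument replaces any stale copy when OpenMode is CREATE_OR_APPEND.
                writer.updateDocument(new Term("path", file.toString()), doc);
            }
            return FileVisitResult.CONTINUE;
        }
    });
}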
From source file:IrqaQuery.java
License:Apache License
public static void makeIndexWriter(String indexPath, String stopPath, String sim) throws IOException {
    System.out.println("[makeIndexWriter] started");
    System.out.println("[makeIndexWriter]" + stopPath);
    Directory dir = FSDirectory.open(Paths.get(indexPath));
    Analyzer analyzer = new EnglishAnalyzer(StopFilter.makeStopSet(mygetStopwords(stopPath)));
    IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

    if (sim.equals("TFIDF"))
        iwc.setSimilarity(new ClassicSimilarity());
    else if (sim.equals("BM25"))
        iwc.setSimilarity(new BM25Similarity());
    else
        iwc.setSimilarity(new BM25Similarity());

    writer = new IndexWriter(dir, iwc);
}
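The mygetStopwords helper is not shown on this page. A plausible sketch, assuming stopPath names a UTF-8 file with one stopword per line:

// Hypothetical sketch of the mygetStopwords helper referenced above.
static List<String> mygetStopwords(String stopPath) throws IOException {
    List<String> stopwords = new ArrayList<>();
    for (String line : Files.readAllLines(Paths.get(stopPath), StandardCharsets.UTF_8)) {
        line = line.trim();
        if (!line.isEmpty()) {
            stopwords.add(line);
        }
    }
    return stopwords;
}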
From source file:IrqaQuery.java
License:Apache License
public static List<Document> query(String index, String stoppath, String question, int numResult, String sim)
        throws Exception {
    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index)));
    IndexSearcher searcher = new IndexSearcher(reader);
    Analyzer analyzer = new EnglishAnalyzer(StopFilter.makeStopSet(mygetStopwords(stoppath)));

    if (sim.equals("TFIDF"))
        searcher.setSimilarity(new ClassicSimilarity());
    else if (sim.equals("BM25"))
        searcher.setSimilarity(new BM25Similarity());
    else
        searcher.setSimilarity(new BM25Similarity());

    String field = "contents";
    QueryParser parser = new QueryParser(field, analyzer);
    // escape() is static, so call it on the class rather than the instance.
    Query query = parser.parse(QueryParser.escape(question));
    TopDocs results = searcher.search(query, numResult);
    ScoreDoc[] hits = results.scoreDocs;

    List<Document> docs = new ArrayList<Document>();
    int numTotalHits = results.totalHits;
    int end = Math.min(numTotalHits, numResult);
    for (int i = 0; i < end; i++) {
        Document doc = searcher.doc(hits[i].doc);
        docs.add(doc);
    }
    // The hit documents are already materialized, so the reader can be closed.
    reader.close();
    return docs;
}
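A minimal, hypothetical call site for the query method above; the index path, stopword file, and question string are all assumptions:

// Hypothetical usage of IrqaQuery.query.
public static void main(String[] args) throws Exception {
    List<Document> top = IrqaQuery.query("/path/to/index", "/path/to/stopwords.txt",
            "what is cosine similarity", 10, "BM25");
    for (Document d : top) {
        System.out.println(d); // dumps the stored fields of each hit
    }
}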