List of usage examples for org.apache.lucene.index.IndexWriter.close()
@Override public void close() throws IOException
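close() commits pending changes by default, closes the index, and releases the write lock. Because IndexWriter and Directory both implement Closeable, the try/finally patterns in the examples below can also be written with try-with-resources. A minimal sketch, not taken from any listing below (the index path and StandardAnalyzer are placeholder assumptions, Lucene 5+ style APIs assumed):

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class CloseExample {
    public static void main(String[] args) throws Exception {
        try (Directory dir = FSDirectory.open(Paths.get("./example-index"));
                IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            writer.addDocument(new Document());
        } // writer.close() and dir.close() run here, in reverse order, even on exception
    }
}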
From source file:cc.wikitools.lucene.IndexWikipediaDump.java
License:Apache License
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("bz2 Wikipedia XML dump file") .create(INPUT_OPTION));/*from w w w . j av a2s . co m*/ options.addOption( OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg() .withDescription("maximum number of documents to index").create(MAX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of indexing threads") .create(THREADS_OPTION)); options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment")); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(IndexWikipediaDump.class.getCanonicalName(), options); System.exit(-1); } String indexPath = cmdline.getOptionValue(INDEX_OPTION); int maxdocs = cmdline.hasOption(MAX_OPTION) ? Integer.parseInt(cmdline.getOptionValue(MAX_OPTION)) : Integer.MAX_VALUE; int threads = cmdline.hasOption(THREADS_OPTION) ? Integer.parseInt(cmdline.getOptionValue(THREADS_OPTION)) : DEFAULT_NUM_THREADS; long startTime = System.currentTimeMillis(); String path = cmdline.getOptionValue(INPUT_OPTION); PrintStream out = new PrintStream(System.out, true, "UTF-8"); WikiClean cleaner = new WikiCleanBuilder().withTitle(true).build(); Directory dir = FSDirectory.open(new File(indexPath)); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, ANALYZER); config.setOpenMode(OpenMode.CREATE); IndexWriter writer = new IndexWriter(dir, config); LOG.info("Creating index at " + indexPath); LOG.info("Indexing with " + threads + " threads"); try { WikipediaBz2DumpInputStream stream = new WikipediaBz2DumpInputStream(path); ExecutorService executor = Executors.newFixedThreadPool(threads); int cnt = 0; String page; while ((page = stream.readNext()) != null) { String title = cleaner.getTitle(page); // These are heuristic specifically for filtering out non-articles in enwiki-20120104. if (title.startsWith("Wikipedia:") || title.startsWith("Portal:") || title.startsWith("File:")) { continue; } if (page.contains("#REDIRECT") || page.contains("#redirect") || page.contains("#Redirect")) { continue; } Runnable worker = new AddDocumentRunnable(writer, cleaner, page); executor.execute(worker); cnt++; if (cnt % 10000 == 0) { LOG.info(cnt + " articles added"); } if (cnt >= maxdocs) { break; } } executor.shutdown(); // Wait until all threads are finish while (!executor.isTerminated()) { } LOG.info("Total of " + cnt + " articles indexed."); if (cmdline.hasOption(OPTIMIZE_OPTION)) { LOG.info("Merging segments..."); writer.forceMerge(1); LOG.info("Done!"); } LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms"); } catch (Exception e) { e.printStackTrace(); } finally { writer.close(); dir.close(); out.close(); } }
From source file:choco.lucene.IKAnalyzerDemo.java
License:Apache License
public static void main(String[] args) {
    // Name of the Lucene Document field
    String fieldName = "text";
    // Text to index
    String text = "IK Analyzer???????";

    // Instantiate the IK analyzer (implements Lucene's Analyzer interface)
    Analyzer analyzer = new IKAnalyzer();

    Directory directory = null;
    IndexWriter iwriter = null;
    IndexReader ireader = null;
    IndexSearcher isearcher = null;
    try {
        // Build an in-memory index
        directory = new RAMDirectory();

        // Configure the IndexWriter
        IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_34, analyzer);
        iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
        iwriter = new IndexWriter(directory, iwConfig);

        // Write a document
        Document doc = new Document();
        doc.add(new Field("ID", "10000", Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(fieldName, text, Field.Store.YES, Field.Index.ANALYZED));
        iwriter.addDocument(doc);
        iwriter.close();

        // Search phase **********************************
        // Instantiate the searcher
        ireader = IndexReader.open(directory);
        isearcher = new IndexSearcher(ireader);

        String keyword = "?";
        // Build a Query with QueryParser
        QueryParser qp = new QueryParser(Version.LUCENE_34, fieldName, analyzer);
        qp.setDefaultOperator(QueryParser.AND_OPERATOR);
        Query query = qp.parse(keyword);

        // Retrieve the 5 highest-scoring records
        TopDocs topDocs = isearcher.search(query, 5);
        System.out.println("hits: " + topDocs.totalHits);
        // Print the results (at most the 5 hits actually returned)
        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        for (int i = 0; i < scoreDocs.length; i++) {
            Document targetDoc = isearcher.doc(scoreDocs[i].doc);
            System.out.println(targetDoc.toString());
        }
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (LockObtainFailedException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (ParseException e) {
        e.printStackTrace();
    } finally {
        if (ireader != null) {
            try {
                ireader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (directory != null) {
            try {
                directory.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
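One caveat in the demo above: iwriter.close() is called inside the try block, so if addDocument or the search throws first, the writer and its write lock are never released. A sketch of a finally block in the demo's own style that also guards the writer (a second close() on an already-closed writer is harmless):

} finally {
    if (iwriter != null) {
        try {
            iwriter.close(); // no-op if the in-line close() above already ran
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
    // ... then close ireader and directory exactly as in the original finally block
}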
From source file:ci6226.buildindex.java
/**
 * @param args the command line arguments
 */
public static void main(String[] args) throws FileNotFoundException, IOException, ParseException {
    String file = "/home/steven/Dropbox/workspace/ntu_coursework/ci6226/Assiment/yelpdata/yelp_training_set/yelp_training_set_review.json";
    JSONParser parser = new JSONParser();
    BufferedReader in = new BufferedReader(new FileReader(file));
    Date start = new Date();
    String indexPath = "./myindex";
    System.out.println("Indexing to directory '" + indexPath + "'...");

    // Analyzer analyzer = new NGramAnalyzer(2, 8);
    Analyzer analyzer = new myAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, analyzer);
    Directory dir = FSDirectory.open(new File(indexPath));

    // :Post-Release-Update-Version.LUCENE_XY:
    // TODO: try a different analyzer, stop words, word stemming; check index size.
    // Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);

    // Add new documents to an existing index:
    iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);

    // Optional: for better indexing performance, if you are indexing many
    // documents, increase the RAM buffer. But if you do this, increase the
    // max heap size of the JVM (eg add -Xmx512m or -Xmx1g):
    //
    // iwc.setRAMBufferSizeMB(256.0);

    IndexWriter writer = new IndexWriter(dir, iwc);

    int line = 0;
    while (in.ready()) {
        String s = in.readLine();
        Object obj = JSONValue.parse(s);
        JSONObject person = (JSONObject) obj;
        String text = (String) person.get("text");
        String user_id = (String) person.get("user_id");
        String business_id = (String) person.get("business_id");
        String review_id = (String) person.get("review_id");
        JSONObject votes = (JSONObject) person.get("votes");
        long funny = (Long) votes.get("funny");
        long cool = (Long) votes.get("cool");
        long useful = (Long) votes.get("useful");

        Document doc = new Document();
        Field review_idf = new StringField("review_id", review_id, Field.Store.YES);
        doc.add(review_idf);
        Field business_idf = new StringField("business_id", business_id, Field.Store.YES);
        doc.add(business_idf);

        // http://qindongliang1922.iteye.com/blog/2030639
        FieldType ft = new FieldType();
        ft.setIndexed(true);  // index the field
        ft.setStored(true);   // store the original value
        ft.setStoreTermVectors(true);
        ft.setTokenized(true);
        ft.setStoreTermVectorPositions(true); // store positions in the term vectors
        ft.setStoreTermVectorOffsets(true);   // store offsets in the term vectors
        Field textf = new Field("text", text, ft);
        doc.add(textf);

        // Field user_idf = new StringField("user_id", user_id, Field.Store.YES);
        // doc.add(user_idf);
        // doc.add(new LongField("cool", cool, Field.Store.YES));
        // doc.add(new LongField("funny", funny, Field.Store.YES));
        // doc.add(new LongField("useful", useful, Field.Store.YES));

        writer.addDocument(doc);
        System.out.println(line++);
    }
    writer.close();
    Date end = new Date();
    System.out.println(end.getTime() - start.getTime() + " total milliseconds");

    // BufferedReader in = new BufferedReader(new FileReader(file));
    // while (in.ready()) {
    //     String s = in.readLine();
    //     JSONObject jsonObject = (JSONObject) ((Object) s);
    //     String rtext = (String) jsonObject.get("text");
    //     System.out.println(rtext);
    // }
    // in.close();
}
From source file:ci6226.eval_index_writer.java
public eval_index_writer(Analyzer _analyzer, String _iReviewLocation, String _dir) throws IOException {
    String file = _iReviewLocation;
    JSONParser parser = new JSONParser();
    BufferedReader in = new BufferedReader(new FileReader(file));
    Date start = new Date();
    String indexPath = "./" + _dir;
    System.out.println("Indexing to directory '" + indexPath + "'...");

    Analyzer analyzer = _analyzer;
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, analyzer);
    Directory dir = FSDirectory.open(new File(indexPath));
    iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
    IndexWriter writer = new IndexWriter(dir, iwc);

    // int line = 0;
    while (in.ready()) {
        String s = in.readLine();
        Object obj = JSONValue.parse(s);
        JSONObject person = (JSONObject) obj;
        String text = (String) person.get("text");
        String user_id = (String) person.get("user_id");
        String business_id = (String) person.get("business_id");
        String review_id = (String) person.get("review_id");
        JSONObject votes = (JSONObject) person.get("votes");
        long funny = (Long) votes.get("funny");
        long cool = (Long) votes.get("cool");
        long useful = (Long) votes.get("useful");

        Document doc = new Document();
        Field review_idf = new StringField("review_id", review_id, Field.Store.YES);
        doc.add(review_idf);
        // Field business_idf = new StringField("business_id", business_id, Field.Store.YES);
        // doc.add(business_idf);

        // http://qindongliang1922.iteye.com/blog/2030639
        FieldType ft = new FieldType();
        ft.setIndexed(true);  // index the field
        ft.setStored(true);   // store the original value
        ft.setStoreTermVectors(true);
        ft.setTokenized(true);
        ft.setStoreTermVectorPositions(true); // store positions in the term vectors
        ft.setStoreTermVectorOffsets(true);   // store offsets in the term vectors
        Field textf = new Field("text", text, ft);
        doc.add(textf);

        // Field user_idf = new StringField("user_id", user_id, Field.Store.YES);
        // doc.add(user_idf);
        // doc.add(new LongField("cool", cool, Field.Store.YES));
        // doc.add(new LongField("funny", funny, Field.Store.YES));
        // doc.add(new LongField("useful", useful, Field.Store.YES));

        writer.addDocument(doc);
        // System.out.println(line++);
    }
    writer.close();
    Date end = new Date();
    System.out.println(end.getTime() - start.getTime() + " total milliseconds");
}
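Both Yelp indexers above configure term vectors through FieldType.setIndexed(true), which is Lucene 4.x API. A sketch of the same configuration against Lucene 5+ (an adaptation, not part of either source file), where whether and how a field is indexed is controlled through IndexOptions instead:

import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;

FieldType ft = new FieldType();
ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); // replaces setIndexed(true)
ft.setStored(true);
ft.setTokenized(true);
ft.setStoreTermVectors(true);
ft.setStoreTermVectorPositions(true);
ft.setStoreTermVectorOffsets(true);
ft.freeze(); // lock the configuration before reusing it across fields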
From source file:cn.codepub.redis.directory.Main.java
License:Apache License
public static void testRedisDirectoryWithShardedJedisPool() throws IOException {
    long start = System.currentTimeMillis();
    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(new WhitespaceAnalyzer())
            .setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    //indexWriterConfig.setInfoStream(System.out);
    //indexWriterConfig.setRAMBufferSizeMB(2048);
    //LogByteSizeMergePolicy logByteSizeMergePolicy = new LogByteSizeMergePolicy();
    //logByteSizeMergePolicy.setMinMergeMB(1);
    //logByteSizeMergePolicy.setMaxMergeMB(64);
    //logByteSizeMergePolicy.setMaxCFSSegmentSizeMB(64);
    //indexWriterConfig.setRAMBufferSizeMB(1024).setMergePolicy(logByteSizeMergePolicy).setUseCompoundFile(false);
    //GenericObjectPoolConfig genericObjectPoolConfig = new GenericObjectPoolConfig();
    //genericObjectPoolConfig.setMaxWaitMillis(3000);

    List<JedisShardInfo> shards = new ArrayList<>();
    JedisShardInfo si = new JedisShardInfo("localhost", 6379, Constants.TIME_OUT);
    //JedisShardInfo si2 = new JedisShardInfo("localhost", 6380);
    shards.add(si);
    //shards.add(si2);

    JedisPoolConfig jedisPoolConfig = new JedisPoolConfig();
    ShardedJedisPool shardedJedisPool = new ShardedJedisPool(jedisPoolConfig, shards);
    RedisDirectory redisDirectory = new RedisDirectory(new ShardedJedisPoolStream(shardedJedisPool));
    IndexWriter indexWriter = new IndexWriter(redisDirectory, indexWriterConfig);
    for (int i = 0; i < 10000000; i++) {
        indexWriter.addDocument(addDocument(i));
    }
    indexWriter.commit();
    indexWriter.close();
    redisDirectory.close();
    long end = System.currentTimeMillis();
    log.error("RedisDirectoryWithShardedJedisPool consumes {}s!", (end - start) / 1000);

    shardedJedisPool = new ShardedJedisPool(jedisPoolConfig, shards);
    start = System.currentTimeMillis();
    IndexSearcher indexSearcher = new IndexSearcher(
            DirectoryReader.open(new RedisDirectory(new ShardedJedisPoolStream(shardedJedisPool))));
    int total = 0;
    for (int i = 0; i < 10000000; i++) {
        TermQuery key1 = new TermQuery(new Term("key1", "key" + i));
        TopDocs search = indexSearcher.search(key1, 10);
        total += search.totalHits;
    }
    System.out.println(total);
    end = System.currentTimeMillis();
    log.error("RedisDirectoryWithShardedJedisPool search consumes {}ms!", (end - start));
}
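The test above buffers ten million addDocument calls before a single commit(), so a crash mid-run loses everything since the last commit. A sketch of the same loop with periodic commits (the 100,000-document interval is an arbitrary assumption; close() still commits any remainder by default):

for (int i = 0; i < 10000000; i++) {
    indexWriter.addDocument(addDocument(i));
    if (i > 0 && i % 100000 == 0) {
        indexWriter.commit(); // durable checkpoint; bounds data loss on a crash
    }
}
indexWriter.commit();
indexWriter.close();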
From source file:cn.fql.blogspider.IndexMain.java
License:Open Source License
public static void main(String[] args) { String indexPath = "d:/test/index"; String docsPath = "d:/test/docs"; boolean create = true; File docDir = new File(docsPath); Date start = new Date(); try {/* ww w .j av a 2 s . co m*/ System.out.println("Indexing to directory '" + indexPath + "'..."); Directory dir = FSDirectory.open(new File(indexPath)); //Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40); Analyzer analyzer = new IKAnalyzer(); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, analyzer); if (create) { iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); } else { iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); } IndexWriter writer = new IndexWriter(dir, iwc); indexDocs(writer, docDir); writer.close(); Date end = new Date(); System.out.println((end.getTime() - start.getTime()) + " total milliseconds"); } catch (IOException e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } }
From source file:cn.jcenterhome.web.action.CpAction.java
private List<String> getKeyWord(String text) throws IOException {
    List<String> keywords = new ArrayList<String>();
    if (!Common.empty(text)) {
        Map<String, Integer> words = new HashMap<String, Integer>();
        Analyzer analyzer = new IKAnalyzer(true);
        StringReader reader = new StringReader(text);
        TokenStream tokenStream = analyzer.tokenStream("*", reader);
        TermAttribute termAtt = (TermAttribute) tokenStream.getAttribute(TermAttribute.class);
        while (tokenStream.incrementToken()) {
            String word = termAtt.term();
            if (word.length() > 1 && Common.strlen(word) > 2) {
                Integer count = words.get(word);
                if (count == null) {
                    count = 0;
                }
                words.put(word, count + 1);
            }
        }
        if (words.size() > 0) {
            Directory dir = null;
            IndexSearcher searcher = null;
            try {
                String fieldName = "text";
                dir = new RAMDirectory();
                IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
                Document doc = new Document();
                doc.add(new Field(fieldName, text, Field.Store.YES, Field.Index.ANALYZED));
                writer.addDocument(doc);
                writer.close();

                searcher = new IndexSearcher(dir);
                searcher.setSimilarity(new IKSimilarity());
                Set<String> keys = words.keySet();
                Map<String, Float> temps = new HashMap<String, Float>();
                for (String key : keys) {
                    int count = words.get(key);
                    Query query = IKQueryParser.parse(fieldName, key);
                    TopDocs topDocs = searcher.search(query, 1);
                    if (topDocs.totalHits > 0) {
                        temps.put(key, topDocs.getMaxScore() * count);
                    }
                }
                Entry<String, Float>[] keywordEntry = getSortedHashtableByValue(temps);
                for (Entry<String, Float> entry : keywordEntry) {
                    if (keywords.size() < 5) {
                        keywords.add(entry.getKey());
                    }
                }
            } catch (Exception e) {
                e.printStackTrace();
            } finally {
                // Null checks guard against an exception thrown before these are assigned.
                if (searcher != null) {
                    try {
                        searcher.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
                if (dir != null) {
                    try {
                        dir.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        }
    }
    return keywords;
}
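The TermAttribute used in the token loop above was deprecated and later removed from Lucene; its replacement is CharTermAttribute. A sketch of the same loop in the modern form (an adaptation, not the original code), including the reset()/end()/close() contract newer versions enforce (requires import org.apache.lucene.analysis.tokenattributes.CharTermAttribute):

TokenStream tokenStream = analyzer.tokenStream("*", new StringReader(text));
CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
tokenStream.reset(); // mandatory before incrementToken() in Lucene 4+
while (tokenStream.incrementToken()) {
    String word = termAtt.toString();
    // ... count the token exactly as in the original loop ...
}
tokenStream.end();
tokenStream.close();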
From source file:cn.larry.search.book.index.IndexFiles.java
License:Apache License
/** Index all text files under a directory. */
public static void main(String[] args) {
    String usage = "java org.apache.lucene.demo.IndexFiles"
            + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n"
            + "This indexes the documents in DOCS_PATH, creating a Lucene index "
            + "in INDEX_PATH that can be searched with SearchFiles";
    String indexPath = "index";
    String docsPath = null;
    boolean create = true;
    for (int i = 0; i < args.length; i++) {
        if ("-index".equals(args[i])) {
            indexPath = args[i + 1];
            i++;
        } else if ("-docs".equals(args[i])) {
            docsPath = args[i + 1];
            i++;
        } else if ("-update".equals(args[i])) {
            create = false;
        }
    }

    if (docsPath == null) {
        System.err.println("Usage: " + usage);
        System.exit(1);
    }

    final Path docDir = Paths.get(docsPath);
    if (!Files.isReadable(docDir)) {
        System.out.println("Document directory '" + docDir.toAbsolutePath()
                + "' does not exist or is not readable, please check the path");
        System.exit(1);
    }

    Date start = new Date();
    try {
        System.out.println("Indexing to directory '" + indexPath + "'...");

        Directory dir = FSDirectory.open(Paths.get(indexPath));
        Analyzer analyzer = new StandardAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

        if (create) {
            // Create a new index in the directory, removing any
            // previously indexed documents:
            iwc.setOpenMode(OpenMode.CREATE);
        } else {
            // Add new documents to an existing index:
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        }

        // Optional: for better indexing performance, if you
        // are indexing many documents, increase the RAM
        // buffer. But if you do this, increase the max heap
        // size of the JVM (eg add -Xmx512m or -Xmx1g):
        //
        // iwc.setRAMBufferSizeMB(256.0);

        IndexWriter writer = new IndexWriter(dir, iwc);
        indexDocs(writer, docDir);

        // NOTE: if you want to maximize search performance,
        // you can optionally call forceMerge here. This can be
        // a terribly costly operation, so generally it's only
        // worth it when your index is relatively static (ie
        // you're done adding documents to it):
        //
        // writer.forceMerge(1);

        writer.close();

        Date end = new Date();
        System.out.println(end.getTime() - start.getTime() + " total milliseconds");
    } catch (IOException e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}
From source file:collene.TestIndexing.java
License:Apache License
@Test
public void test() throws IOException, ParseException {
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_9);

    // write it out.
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    IndexWriter writer = new IndexWriter(directory, config);

    for (int i = 0; i < 100; i++) {
        Collection<Document> documents = new ArrayList<Document>();
        Document doc = new Document();
        doc.add(new Field("key", "aaa_" + i, TextField.TYPE_STORED));
        doc.add(new Field("not", "notaaa", TextField.TYPE_NOT_STORED));
        doc.add(new Field("meta", "aaa_meta_aaa_" + i, TextField.TYPE_STORED));
        documents.add(doc);

        writer.addDocuments(documents);

        writer.commit();
        writer.forceMerge(1);
        writer.forceMergeDeletes(true);
    }

    // now read it back.
    IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(writer, false));
    QueryParser parser = new QueryParser(Version.LUCENE_4_9, "key", analyzer);

    Query query = parser.parse("aaa_4");
    TopDocs docs = searcher.search(query, 1);
    int idToDelete = docs.scoreDocs[0].doc;
    Assert.assertTrue(docs.totalHits > 0);

    query = parser.parse("fersoius");
    docs = searcher.search(query, 1);
    Assert.assertFalse(docs.totalHits > 0);

    // delete that document.
    DirectoryReader reader = DirectoryReader.open(writer, true);
    writer.tryDeleteDocument(reader, idToDelete);
    reader.close();
    writer.close();

    // list files
    Set<String> files = new HashSet<String>();
    System.out.println("Listing files for " + directory.toString());
    for (String file : directory.listAll()) {
        files.add(file);
        System.out.println(" " + file);
    }

    if (strictFileChecking) {
        System.out.println("Strict file checking...");
        Sets.SetView<String> difference = Sets.difference(expectedFiles, files);
        Assert.assertEquals(Joiner.on(",").join(difference), 0, difference.size());
    }

    reader = DirectoryReader.open(directory);
    searcher = new IndexSearcher(reader);
    query = parser.parse("aaa_4");
    docs = searcher.search(query, 1);
    reader.close();
    Assert.assertFalse(docs.totalHits > 0);

    directory.close();
}
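The test opens its readers with DirectoryReader.open(writer, ...), which yields a near-real-time view of changes that have not yet been committed. When such a reader is reused across several write batches, the usual idiom is openIfChanged rather than reopening from scratch; a sketch (variable names are assumptions):

DirectoryReader reader = DirectoryReader.open(writer, true);
// ... more documents are added via the same writer ...
DirectoryReader newer = DirectoryReader.openIfChanged(reader);
if (newer != null) {
    reader.close(); // release the stale view
    reader = newer; // search against the fresher one
}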
From source file:collene.TestLuceneAssumptions.java
License:Apache License
@Test
public void testCanSeeUpdatesAfterAdd() throws Exception {
    // this verifies that any reader can see updates after documents are added.
    File fdir = TestUtil.getRandomTempDir();
    pleaseDelete.add(fdir);

    Directory dir = FSDirectory.open(fdir);
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_4_9);
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_4_9, analyzer);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    IndexWriter writer = new IndexWriter(dir, config);

    Document doc0 = new Document();
    Document doc1 = new Document();
    doc0.add(new Field("f0", "aaa", TextField.TYPE_STORED));
    doc1.add(new Field("f0", "bbb", TextField.TYPE_STORED));
    List<Document> docs = Lists.newArrayList(doc0, doc1);
    writer.addDocuments(docs, analyzer);

    IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(writer, false));
    QueryParser parser = new QueryParser(Version.LUCENE_4_9, "f0", new StandardAnalyzer(Version.LUCENE_4_9));
    Query query = parser.parse("bbb");
    TopDocs topDocs = searcher.search(query, 10);

    Assert.assertEquals(1, topDocs.totalHits);
    Assert.assertEquals(1, topDocs.scoreDocs.length);

    writer.close();
    dir.close();
}
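A version note on the test above, which pins Lucene 4.9: the addDocuments(docs, analyzer) overload that takes a per-call Analyzer was removed in later major releases, after which the analyzer configured on the IndexWriterConfig applies to every added document. A sketch of the later-API equivalent (the exact removal version is an assumption):

// Later-Lucene sketch: no per-call Analyzer overload; the analyzer passed
// to IndexWriterConfig is used for all documents.
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer));
writer.addDocuments(docs);
writer.close();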