List of usage examples for org.apache.lucene.index IndexWriterConfig setOpenMode
public IndexWriterConfig setOpenMode(OpenMode openMode)
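setOpenMode controls how the IndexWriter treats an existing index: OpenMode.CREATE replaces any existing index, OpenMode.APPEND requires one to already exist, and OpenMode.CREATE_OR_APPEND (the default) creates the index if it is missing and otherwise appends to it. The examples below target older releases (Lucene 3.x/4.x), where IndexWriterConfig also took a Version argument and FSDirectory.open took a File. Here is a minimal standalone sketch against a current Lucene (5.x or later); the class name and the "index" path are placeholders:

import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class OpenModeExample {
    public static void main(String[] args) throws Exception {
        // "index" is a placeholder path; any writable directory works.
        Directory dir = FSDirectory.open(Paths.get("index"));
        IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
        // CREATE would wipe an existing index; APPEND would fail if none exists.
        config.setOpenMode(OpenMode.CREATE_OR_APPEND);
        try (IndexWriter writer = new IndexWriter(dir, config)) {
            writer.commit(); // the index now exists on disk
        }
        dir.close();
    }
}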
From source file:ca.pgon.freenetknowledge.search.impl.LuceneIndexerThread.java
License:Apache License
private IndexWriter genIndexWriter() throws CorruptIndexException, LockObtainFailedException, IOException {
    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(LuceneSearchEngine.LUCENE_VERSION, analyzer);
    indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
    return new IndexWriter(directory, indexWriterConfig);
}
From source file:ca.ualberta.entitylinking.common.indexing.DocumentIndexer.java
License:Open Source License
public void initWriter(String indexDir, boolean create) {
    try {
        Directory dir = FSDirectory.open(new File(indexDir));
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_34, null);

        // Create a new index, or append to an existing one.
        if (create)
            iwc.setOpenMode(OpenMode.CREATE);
        else
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);

        writer = new IndexWriter(dir, iwc);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:ca.ualberta.entitylinking.common.indexing.WikipediaIndex.java
License:Open Source License
public void initWriter(String indexDir, boolean create) {
    try {
        Directory dir = FSDirectory.open(new File(indexDir));
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_34, new StandardAnalyzer(Version.LUCENE_34));

        // Create a new index, or append to an existing one.
        if (create)
            iwc.setOpenMode(OpenMode.CREATE);
        else
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);

        writer = new IndexWriter(dir, iwc);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file:cc.pp.analyzer.ik.demo.IKAnalyzerDemo.java
License:Apache License
public static void main(String[] args) {
    // Field name and sample text for the Lucene Document.
    String fieldName = "text";
    String text = "IK Analyzer???????"; // original Chinese sample text was garbled in extraction

    // Build the IK Analyzer in smart-segmentation mode.
    Analyzer analyzer = new IKAnalyzer(Version.LUCENE_48, true);

    Directory directory = null;
    IndexWriter iwriter = null;
    DirectoryReader ireader = null;
    IndexSearcher isearcher = null;
    try {
        // Build an in-memory index.
        directory = new RAMDirectory();

        IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_48, analyzer);
        iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
        iwriter = new IndexWriter(directory, iwConfig);

        Document doc = new Document();
        doc.add(new LongField("ID", 1000, Field.Store.YES));
        doc.add(new TextField(fieldName, text, Field.Store.YES));
        iwriter.addDocument(doc);
        iwriter.close();

        // Search phase.
        ireader = DirectoryReader.open(directory);
        isearcher = new IndexSearcher(ireader);

        String keyword = "?"; // original Chinese keyword was garbled in extraction
        // String keyword = "";

        // Build a Query with QueryParser.
        QueryParser qp = new QueryParser(Version.LUCENE_48, fieldName, analyzer);
        qp.setDefaultOperator(QueryParser.AND_OPERATOR);
        Query query = qp.parse(keyword);
        System.out.println("Query = " + query);

        // Retrieve the top 5 hits.
        TopDocs topDocs = isearcher.search(query, 5);
        System.out.println("Total hits: " + topDocs.totalHits);

        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        for (int i = 0; i < topDocs.totalHits; i++) {
            Document targetDoc = isearcher.doc(scoreDocs[i].doc);
            System.out.println("Hit: " + targetDoc.toString());
        }
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (LockObtainFailedException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (ParseException e) {
        e.printStackTrace();
    } finally {
        if (ireader != null) {
            try {
                ireader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (directory != null) {
            try {
                directory.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file:cc.twittertools.index.IndexStatuses.java
License:Apache License
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(new Option(HELP_OPTION, "show help")); options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment")); options.addOption(new Option(STORE_TERM_VECTORS_OPTION, "store term vectors")); options.addOption(OptionBuilder.withArgName("dir").hasArg().withDescription("source collection directory") .create(COLLECTION_OPTION)); options.addOption(/* w w w . ja v a2 s.c om*/ OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("file").hasArg().withDescription("file with deleted tweetids") .create(DELETES_OPTION)); options.addOption(OptionBuilder.withArgName("id").hasArg().withDescription("max id").create(MAX_ID_OPTION)); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (cmdline.hasOption(HELP_OPTION) || !cmdline.hasOption(COLLECTION_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(IndexStatuses.class.getName(), options); System.exit(-1); } String collectionPath = cmdline.getOptionValue(COLLECTION_OPTION); String indexPath = cmdline.getOptionValue(INDEX_OPTION); final FieldType textOptions = new FieldType(); textOptions.setIndexed(true); textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); textOptions.setStored(true); textOptions.setTokenized(true); if (cmdline.hasOption(STORE_TERM_VECTORS_OPTION)) { textOptions.setStoreTermVectors(true); } LOG.info("collection: " + collectionPath); LOG.info("index: " + indexPath); LongOpenHashSet deletes = null; if (cmdline.hasOption(DELETES_OPTION)) { deletes = new LongOpenHashSet(); File deletesFile = new File(cmdline.getOptionValue(DELETES_OPTION)); if (!deletesFile.exists()) { System.err.println("Error: " + deletesFile + " does not exist!"); System.exit(-1); } LOG.info("Reading deletes from " + deletesFile); FileInputStream fin = new FileInputStream(deletesFile); byte[] ignoreBytes = new byte[2]; fin.read(ignoreBytes); // "B", "Z" bytes from commandline tools BufferedReader br = new BufferedReader(new InputStreamReader(new CBZip2InputStream(fin))); String s; while ((s = br.readLine()) != null) { if (s.contains("\t")) { deletes.add(Long.parseLong(s.split("\t")[0])); } else { deletes.add(Long.parseLong(s)); } } br.close(); fin.close(); LOG.info("Read " + deletes.size() + " tweetids from deletes file."); } long maxId = Long.MAX_VALUE; if (cmdline.hasOption(MAX_ID_OPTION)) { maxId = Long.parseLong(cmdline.getOptionValue(MAX_ID_OPTION)); LOG.info("index: " + maxId); } long startTime = System.currentTimeMillis(); File file = new File(collectionPath); if (!file.exists()) { System.err.println("Error: " + file + " does not exist!"); System.exit(-1); } StatusStream stream = new JsonStatusCorpusReader(file); Directory dir = FSDirectory.open(new File(indexPath)); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, IndexStatuses.ANALYZER); config.setOpenMode(OpenMode.CREATE); IndexWriter writer = new IndexWriter(dir, config); int cnt = 0; Status status; try { while ((status = stream.next()) != null) { if (status.getText() == null) { continue; } // Skip deletes tweetids. 
if (deletes != null && deletes.contains(status.getId())) { continue; } if (status.getId() > maxId) { continue; } cnt++; Document doc = new Document(); doc.add(new LongField(StatusField.ID.name, status.getId(), Field.Store.YES)); doc.add(new LongField(StatusField.EPOCH.name, status.getEpoch(), Field.Store.YES)); doc.add(new TextField(StatusField.SCREEN_NAME.name, status.getScreenname(), Store.YES)); doc.add(new Field(StatusField.TEXT.name, status.getText(), textOptions)); doc.add(new IntField(StatusField.FRIENDS_COUNT.name, status.getFollowersCount(), Store.YES)); doc.add(new IntField(StatusField.FOLLOWERS_COUNT.name, status.getFriendsCount(), Store.YES)); doc.add(new IntField(StatusField.STATUSES_COUNT.name, status.getStatusesCount(), Store.YES)); long inReplyToStatusId = status.getInReplyToStatusId(); if (inReplyToStatusId > 0) { doc.add(new LongField(StatusField.IN_REPLY_TO_STATUS_ID.name, inReplyToStatusId, Field.Store.YES)); doc.add(new LongField(StatusField.IN_REPLY_TO_USER_ID.name, status.getInReplyToUserId(), Field.Store.YES)); } String lang = status.getLang(); if (!lang.equals("unknown")) { doc.add(new TextField(StatusField.LANG.name, status.getLang(), Store.YES)); } long retweetStatusId = status.getRetweetedStatusId(); if (retweetStatusId > 0) { doc.add(new LongField(StatusField.RETWEETED_STATUS_ID.name, retweetStatusId, Field.Store.YES)); doc.add(new LongField(StatusField.RETWEETED_USER_ID.name, status.getRetweetedUserId(), Field.Store.YES)); doc.add(new IntField(StatusField.RETWEET_COUNT.name, status.getRetweetCount(), Store.YES)); if (status.getRetweetCount() < 0 || status.getRetweetedStatusId() < 0) { LOG.warn("Error parsing retweet fields of " + status.getId()); } } writer.addDocument(doc); if (cnt % 100000 == 0) { LOG.info(cnt + " statuses indexed"); } } LOG.info(String.format("Total of %s statuses added", cnt)); if (cmdline.hasOption(OPTIMIZE_OPTION)) { LOG.info("Merging segments..."); writer.forceMerge(1); LOG.info("Done!"); } LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms"); } catch (Exception e) { e.printStackTrace(); } finally { writer.close(); dir.close(); stream.close(); } }
From source file:cc.wikitools.lucene.IndexWikipediaDump.java
License:Apache License
@SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("bz2 Wikipedia XML dump file") .create(INPUT_OPTION));/*from ww w. ja va2 s . c o m*/ options.addOption( OptionBuilder.withArgName("dir").hasArg().withDescription("index location").create(INDEX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg() .withDescription("maximum number of documents to index").create(MAX_OPTION)); options.addOption(OptionBuilder.withArgName("num").hasArg().withDescription("number of indexing threads") .create(THREADS_OPTION)); options.addOption(new Option(OPTIMIZE_OPTION, "merge indexes into a single segment")); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(INPUT_OPTION) || !cmdline.hasOption(INDEX_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(IndexWikipediaDump.class.getCanonicalName(), options); System.exit(-1); } String indexPath = cmdline.getOptionValue(INDEX_OPTION); int maxdocs = cmdline.hasOption(MAX_OPTION) ? Integer.parseInt(cmdline.getOptionValue(MAX_OPTION)) : Integer.MAX_VALUE; int threads = cmdline.hasOption(THREADS_OPTION) ? Integer.parseInt(cmdline.getOptionValue(THREADS_OPTION)) : DEFAULT_NUM_THREADS; long startTime = System.currentTimeMillis(); String path = cmdline.getOptionValue(INPUT_OPTION); PrintStream out = new PrintStream(System.out, true, "UTF-8"); WikiClean cleaner = new WikiCleanBuilder().withTitle(true).build(); Directory dir = FSDirectory.open(new File(indexPath)); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, ANALYZER); config.setOpenMode(OpenMode.CREATE); IndexWriter writer = new IndexWriter(dir, config); LOG.info("Creating index at " + indexPath); LOG.info("Indexing with " + threads + " threads"); try { WikipediaBz2DumpInputStream stream = new WikipediaBz2DumpInputStream(path); ExecutorService executor = Executors.newFixedThreadPool(threads); int cnt = 0; String page; while ((page = stream.readNext()) != null) { String title = cleaner.getTitle(page); // These are heuristic specifically for filtering out non-articles in enwiki-20120104. if (title.startsWith("Wikipedia:") || title.startsWith("Portal:") || title.startsWith("File:")) { continue; } if (page.contains("#REDIRECT") || page.contains("#redirect") || page.contains("#Redirect")) { continue; } Runnable worker = new AddDocumentRunnable(writer, cleaner, page); executor.execute(worker); cnt++; if (cnt % 10000 == 0) { LOG.info(cnt + " articles added"); } if (cnt >= maxdocs) { break; } } executor.shutdown(); // Wait until all threads are finish while (!executor.isTerminated()) { } LOG.info("Total of " + cnt + " articles indexed."); if (cmdline.hasOption(OPTIMIZE_OPTION)) { LOG.info("Merging segments..."); writer.forceMerge(1); LOG.info("Done!"); } LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms"); } catch (Exception e) { e.printStackTrace(); } finally { writer.close(); dir.close(); out.close(); } }
From source file:ch.ksfx.web.services.lucene.AsynchronousIndexer.java
License:Open Source License
public void run() {
    while (true) {
        try {
            if (writer == null || !writer.isOpen()) {
                IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
                iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
                writer = new IndexWriter(dir, iwc);
            }

            IndexEvent indexEvent = queuedIndexEvents.take();
            String sortableDateTime = null;

            if (indexEvent.getDeleteEvent()) {
                if (indexEvent instanceof DeleteSeriesObservationsEvent) {
                    writer.deleteDocuments(new Term("series_id", indexEvent.getSeriesId().toString()));
                } else {
                    sortableDateTime = DateFormatUtil.formatToLexicographicallySortableTimeAndDateString(
                            indexEvent.getObservationTime());
                    writer.deleteDocuments(new Term("internal_id",
                            indexEvent.getSeriesId().toString() + sortableDateTime + indexEvent.getSourceId()));
                }

                if (queuedIndexEvents.size() == 0) {
                    writer.close();
                }
                continue;
            }

            Observation obs = observationDAO.getObservationForTimeSeriesIdObservationTimeAndSourceId(
                    indexEvent.getSeriesId(), indexEvent.getObservationTime(), indexEvent.getSourceId());

            Document doc = new Document();
            sortableDateTime = DateFormatUtil
                    .formatToLexicographicallySortableTimeAndDateString(obs.getObservationTime());
            String isoDateTime = DateFormatUtil.formatToISO8601TimeAndDateString(obs.getObservationTime());

            doc.add(new StringField("internal_id",
                    obs.getTimeSeriesId().toString() + sortableDateTime + obs.getSourceId(), Field.Store.NO));
            doc.add(new StringField("series_id", obs.getTimeSeriesId().toString(), Field.Store.YES));
            doc.add(new StringField("observation_time", isoDateTime, Field.Store.YES));
            doc.add(new StringField("sortable_observation_time", sortableDateTime, Field.Store.NO));
            doc.add(new StringField("source_id", obs.getSourceId(), Field.Store.YES));

            addField(doc, "source_uri", obs.getSourceId());
            addField(doc, "scalar_value", obs.getScalarValue());

            for (String key : obs.getComplexValue().keySet()) {
                addField(doc, key, obs.getComplexValue().get(key));
            }

            // System.out.println("Meta data to index: " + obs.getMetaData() + " Size: " + obs.getMetaData().size());
            for (String key : obs.getMetaData().keySet()) {
                System.out.println("Indexing meta data: " + key + " --> " + obs.getMetaData().get(key));
                addField(doc, key, obs.getMetaData().get(key));
            }

            writer.updateDocument(new Term("internal_id",
                    obs.getTimeSeriesId().toString() + sortableDateTime + obs.getSourceId()), doc);

            if (queuedIndexEvents.size() == 0) {
                writer.close();
            }
        } catch (Exception e) {
            logger.error("Error while Asynchronous Indexing", e);
        }
    }
}
From source file:choco.lucene.IKAnalyzerDemo.java
License:Apache License
public static void main(String[] args) {
    // Field name and sample text for the Lucene Document.
    String fieldName = "text";
    String text = "IK Analyzer???????"; // original Chinese sample text was garbled in extraction

    // Build the IK Analyzer.
    Analyzer analyzer = new IKAnalyzer();

    Directory directory = null;
    IndexWriter iwriter = null;
    IndexReader ireader = null;
    IndexSearcher isearcher = null;
    try {
        // Build an in-memory index.
        directory = new RAMDirectory();

        IndexWriterConfig iwConfig = new IndexWriterConfig(Version.LUCENE_34, analyzer);
        iwConfig.setOpenMode(OpenMode.CREATE_OR_APPEND);
        iwriter = new IndexWriter(directory, iwConfig);

        Document doc = new Document();
        doc.add(new Field("ID", "10000", Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field(fieldName, text, Field.Store.YES, Field.Index.ANALYZED));
        iwriter.addDocument(doc);
        iwriter.close();

        // Search phase.
        ireader = IndexReader.open(directory);
        isearcher = new IndexSearcher(ireader);

        String keyword = "?"; // original Chinese keyword was garbled in extraction

        // Build a Query with QueryParser.
        QueryParser qp = new QueryParser(Version.LUCENE_34, fieldName, analyzer);
        qp.setDefaultOperator(QueryParser.AND_OPERATOR);
        Query query = qp.parse(keyword);

        // Retrieve the top 5 hits.
        TopDocs topDocs = isearcher.search(query, 5);
        System.out.println("Total hits: " + topDocs.totalHits);

        ScoreDoc[] scoreDocs = topDocs.scoreDocs;
        for (int i = 0; i < topDocs.totalHits; i++) {
            Document targetDoc = isearcher.doc(scoreDocs[i].doc);
            System.out.println("Hit: " + targetDoc.toString());
        }
    } catch (CorruptIndexException e) {
        e.printStackTrace();
    } catch (LockObtainFailedException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (ParseException e) {
        e.printStackTrace();
    } finally {
        if (ireader != null) {
            try {
                ireader.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (directory != null) {
            try {
                directory.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file:ci6226.buildindex.java
/**
 * @param args the command line arguments
 */
public static void main(String[] args) throws FileNotFoundException, IOException, ParseException {
    String file = "/home/steven/Dropbox/workspace/ntu_coursework/ci6226/Assiment/yelpdata/yelp_training_set/yelp_training_set_review.json";
    JSONParser parser = new JSONParser();
    BufferedReader in = new BufferedReader(new FileReader(file));
    // List<Document> jdocs = new LinkedList<Document>();
    Date start = new Date();
    String indexPath = "./myindex";
    System.out.println("Indexing to directory '" + indexPath + "'...");

    // Analyzer analyzer = new NGramAnalyzer(2, 8);
    Analyzer analyzer = new myAnalyzer();
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, analyzer);
    Directory dir = FSDirectory.open(new File(indexPath));

    // :Post-Release-Update-Version.LUCENE_XY:
    // TODO: try different analyzers, stop words, word stemming; check size.
    // Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47);

    // Add new documents to an existing index:
    iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);

    // Optional: for better indexing performance, if you
    // are indexing many documents, increase the RAM
    // buffer. But if you do this, increase the max heap
    // size to the JVM (eg add -Xmx512m or -Xmx1g):
    //
    // iwc.setRAMBufferSizeMB(256.0);

    IndexWriter writer = new IndexWriter(dir, iwc);
    // writer.addDocuments(jdocs);

    int line = 0;
    while (in.ready()) {
        String s = in.readLine();
        Object obj = JSONValue.parse(s);
        JSONObject person = (JSONObject) obj;
        String text = (String) person.get("text");
        String user_id = (String) person.get("user_id");
        String business_id = (String) person.get("business_id");
        String review_id = (String) person.get("review_id");
        JSONObject votes = (JSONObject) person.get("votes");
        long funny = (Long) votes.get("funny");
        long cool = (Long) votes.get("cool");
        long useful = (Long) votes.get("useful");

        Document doc = new Document();
        Field review_idf = new StringField("review_id", review_id, Field.Store.YES);
        doc.add(review_idf);
        Field business_idf = new StringField("business_id", business_id, Field.Store.YES);
        doc.add(business_idf);

        // http://qindongliang1922.iteye.com/blog/2030639
        FieldType ft = new FieldType();
        ft.setIndexed(true); // index the field
        ft.setStored(true); // store the original value
        ft.setStoreTermVectors(true);
        ft.setTokenized(true);
        ft.setStoreTermVectorPositions(true); // store token positions in the term vector
        ft.setStoreTermVectorOffsets(true); // store token character offsets in the term vector
        Field textf = new Field("text", text, ft);
        doc.add(textf);

        // Field user_idf = new StringField("user_id", user_id, Field.Store.YES);
        // doc.add(user_idf);
        // doc.add(new LongField("cool", cool, Field.Store.YES));
        // doc.add(new LongField("funny", funny, Field.Store.YES));
        // doc.add(new LongField("useful", useful, Field.Store.YES));

        writer.addDocument(doc);
        System.out.println(line++);
    }
    writer.close();
    in.close();

    Date end = new Date();
    System.out.println(end.getTime() - start.getTime() + " total milliseconds");

    // Earlier experiment, kept commented out in the source:
    // BufferedReader in = new BufferedReader(new FileReader(file));
    // while (in.ready()) {
    //     String s = in.readLine();
    //     // System.out.println(s);
    //     JSONObject jsonObject = (JSONObject) ((Object) s);
    //     String rtext = (String) jsonObject.get("text");
    //     System.out.println(rtext);
    //     // long age = (Long) jsonObject.get("age");
    //     // System.out.println(age);
    // }
    // in.close();
}
From source file:ci6226.eval_index_writer.java
public eval_index_writer(Analyzer _analyzer, String _iReviewLocation, String _dir) throws IOException {
    String file = _iReviewLocation;
    JSONParser parser = new JSONParser();
    BufferedReader in = new BufferedReader(new FileReader(file));
    Date start = new Date();
    String indexPath = "./" + _dir;
    System.out.println("Indexing to directory '" + indexPath + "'...");

    Analyzer analyzer = _analyzer;
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, analyzer);
    Directory dir = FSDirectory.open(new File(indexPath));
    iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
    IndexWriter writer = new IndexWriter(dir, iwc);

    // int line = 0;
    while (in.ready()) {
        String s = in.readLine();
        Object obj = JSONValue.parse(s);
        JSONObject person = (JSONObject) obj;
        String text = (String) person.get("text");
        String user_id = (String) person.get("user_id");
        String business_id = (String) person.get("business_id");
        String review_id = (String) person.get("review_id");
        JSONObject votes = (JSONObject) person.get("votes");
        long funny = (Long) votes.get("funny");
        long cool = (Long) votes.get("cool");
        long useful = (Long) votes.get("useful");

        Document doc = new Document();
        Field review_idf = new StringField("review_id", review_id, Field.Store.YES);
        doc.add(review_idf);

        // Field business_idf = new StringField("business_id", business_id, Field.Store.YES);
        // doc.add(business_idf);

        // http://qindongliang1922.iteye.com/blog/2030639
        FieldType ft = new FieldType();
        ft.setIndexed(true); // index the field
        ft.setStored(true); // store the original value
        ft.setStoreTermVectors(true);
        ft.setTokenized(true);
        ft.setStoreTermVectorPositions(true); // store token positions in the term vector
        ft.setStoreTermVectorOffsets(true); // store token character offsets in the term vector
        Field textf = new Field("text", text, ft);
        doc.add(textf);

        // Field user_idf = new StringField("user_id", user_id, Field.Store.YES);
        // doc.add(user_idf);
        // doc.add(new LongField("cool", cool, Field.Store.YES));
        // doc.add(new LongField("funny", funny, Field.Store.YES));
        // doc.add(new LongField("useful", useful, Field.Store.YES));

        writer.addDocument(doc);
        // System.out.println(line++);
    }
    writer.close();
    in.close();

    Date end = new Date();
    System.out.println(end.getTime() - start.getTime() + " total milliseconds");
}
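None of the examples above use OpenMode.APPEND, which opens an existing index and throws IndexNotFoundException if none is present. A minimal sketch, assuming Lucene 5+ APIs; the class name and the "index" path are placeholders, and the directory is assumed to already contain an index:

import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class AppendExample {
    public static void main(String[] args) throws Exception {
        Directory dir = FSDirectory.open(Paths.get("index")); // placeholder path
        IndexWriterConfig config = new IndexWriterConfig(new StandardAnalyzer());
        // APPEND refuses to create a new index: it fails fast if the
        // directory does not already hold one.
        config.setOpenMode(OpenMode.APPEND);
        try (IndexWriter writer = new IndexWriter(dir, config)) {
            Document doc = new Document();
            doc.add(new TextField("text", "appended document", Field.Store.YES));
            writer.addDocument(doc);
        }
        dir.close();
    }
}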