Usage examples for org.apache.lucene.index.IndexWriterConfig#setOpenMode(OpenMode), collected from open-source projects
public IndexWriterConfig setOpenMode(OpenMode openMode)
From source file:edu.cmu.geolocator.io.GetWriter.java
License:Apache License
/**
 * Opens an {@link IndexWriter} on the given directory in CREATE_OR_APPEND mode.
 *
 * @param indexdirectory path of the on-disk index directory
 * @param buffersize     RAM buffer size for the writer, in megabytes
 * @return a ready-to-use writer; the caller is responsible for closing it
 * @throws IOException if the directory or writer cannot be opened
 */
public static IndexWriter getIndexWriter(String indexdirectory, double buffersize) throws IOException {
    // NOTE(review): NIO directories are presumably avoided on Windows because of
    // known JRE file-channel issues there — confirm against OSUtil's intent.
    Directory dir;
    if (OSUtil.isWindows()) {
        dir = FSDirectory.open(new File(indexdirectory));
    } else {
        dir = NIOFSDirectory.open(new File(indexdirectory));
    }

    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_45);
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_45, analyzer);
    config.setOpenMode(OpenMode.CREATE_OR_APPEND);
    config.setRAMBufferSizeMB(buffersize);

    // Doc-count-based merging with a small merge factor (3): fewer segments,
    // more frequent merges.
    LogDocMergePolicy mergePolicy = new LogDocMergePolicy();
    mergePolicy.setMergeFactor(3);
    config.setMergePolicy(mergePolicy);

    return new IndexWriter(dir, config);
}
From source file:edu.cmu.lti.huiying.ir.rangedsearch.TableIndexer.java
License:Apache License
/** Index all text files under a directory. */ public static void main(String[] args) { String usage = "java org.apache.lucene.demo.IndexFiles" + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n" + "This indexes the documents in DOCS_PATH, creating a Lucene index" + "in INDEX_PATH that can be searched with SearchFiles"; String indexPath = "index"; String docsPath = "/home/huiying/JavaWorkspace/TableSemantics/web/numericalDemo/public/data"; boolean create = true; int nfile = 0; // for(int i=0;i<args.length;i++) { // if ("-index".equals(args[i])) { // indexPath = args[i+1]; // i++;//from w ww .ja v a 2s. c om // } else if ("-docs".equals(args[i])) { // docsPath = args[i+1]; // i++; // } else if ("-update".equals(args[i])) { // create = false; // } // } if (docsPath == null) { System.err.println("Usage: " + usage); System.exit(1); } final File docDir = new File(docsPath); if (!docDir.exists() || !docDir.canRead()) { System.out.println("Document directory '" + docDir.getAbsolutePath() + "' does not exist or is not readable, please check the path"); System.exit(1); } Date start = new Date(); TableIndexer tind = new TableIndexer(); try { System.out.println("Indexing to directory '" + indexPath + "'..."); Directory dir = FSDirectory.open(new File(indexPath)); // :Post-Release-Update-Version.LUCENE_XY: //Analyzer analyzer = new TableAnalyzer(); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_4_10_0, new StandardAnalyzer()); if (create) { // Create a new index in the directory, removing any // previously indexed documents: iwc.setOpenMode(OpenMode.CREATE); } else { // Add new documents to an existing index: iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); } IndexWriter writer = new IndexWriter(dir, iwc); // tind.indexOffsetAnnotation(writer, docDir); tind.indexExplodedXml(writer, docDir); writer.close(); Date end = new Date(); System.out.println("total doc added:" + tind.totalDocAdded); System.out.println(end.getTime() - start.getTime() + " total 
milliseconds"); } catch (IOException e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } }
From source file:edu.cmu.lti.oaqa.knn4qa.apps.LuceneIndexer.java
License:Apache License
public static void main(String[] args) { Options options = new Options(); options.addOption(CommonParams.ROOT_DIR_PARAM, null, true, CommonParams.ROOT_DIR_DESC); options.addOption(CommonParams.SUB_DIR_TYPE_PARAM, null, true, CommonParams.SUB_DIR_TYPE_DESC); options.addOption(CommonParams.MAX_NUM_REC_PARAM, null, true, CommonParams.MAX_NUM_REC_DESC); options.addOption(CommonParams.SOLR_FILE_NAME_PARAM, null, true, CommonParams.SOLR_FILE_NAME_DESC); options.addOption(CommonParams.OUT_INDEX_PARAM, null, true, CommonParams.OUT_MINDEX_DESC); CommandLineParser parser = new org.apache.commons.cli.GnuParser(); try {//from www. j a v a 2 s . c o m CommandLine cmd = parser.parse(options, args); String rootDir = null; rootDir = cmd.getOptionValue(CommonParams.ROOT_DIR_PARAM); if (null == rootDir) Usage("Specify: " + CommonParams.ROOT_DIR_DESC, options); String outputDirName = cmd.getOptionValue(CommonParams.OUT_INDEX_PARAM); if (null == outputDirName) Usage("Specify: " + CommonParams.OUT_MINDEX_DESC, options); String subDirTypeList = cmd.getOptionValue(CommonParams.SUB_DIR_TYPE_PARAM); if (null == subDirTypeList || subDirTypeList.isEmpty()) Usage("Specify: " + CommonParams.SUB_DIR_TYPE_DESC, options); String solrFileName = cmd.getOptionValue(CommonParams.SOLR_FILE_NAME_PARAM); if (null == solrFileName) Usage("Specify: " + CommonParams.SOLR_FILE_NAME_DESC, options); int maxNumRec = Integer.MAX_VALUE; String tmp = cmd.getOptionValue(CommonParams.MAX_NUM_REC_PARAM); if (tmp != null) { try { maxNumRec = Integer.parseInt(tmp); if (maxNumRec <= 0) { Usage("The maximum number of records should be a positive integer", options); } } catch (NumberFormatException e) { Usage("The maximum number of records should be a positive integer", options); } } File outputDir = new File(outputDirName); if (!outputDir.exists()) { if (!outputDir.mkdirs()) { System.out.println("couldn't create " + outputDir.getAbsolutePath()); System.exit(1); } } if (!outputDir.isDirectory()) { 
System.out.println(outputDir.getAbsolutePath() + " is not a directory!"); System.exit(1); } if (!outputDir.canWrite()) { System.out.println("Can't write to " + outputDir.getAbsolutePath()); System.exit(1); } String subDirs[] = subDirTypeList.split(","); int docNum = 0; // No English analyzer here, all language-related processing is done already, // here we simply white-space tokenize and index tokens verbatim. Analyzer analyzer = new WhitespaceAnalyzer(); FSDirectory indexDir = FSDirectory.open(outputDir); IndexWriterConfig indexConf = new IndexWriterConfig(analyzer.getVersion(), analyzer); System.out.println("Creating a new Lucene index, maximum # of docs to process: " + maxNumRec); indexConf.setOpenMode(OpenMode.CREATE); IndexWriter indexWriter = new IndexWriter(indexDir, indexConf); for (int subDirId = 0; subDirId < subDirs.length && docNum < maxNumRec; ++subDirId) { String inputFileName = rootDir + "/" + subDirs[subDirId] + "/" + solrFileName; System.out.println("Input file name: " + inputFileName); BufferedReader inpText = new BufferedReader( new InputStreamReader(CompressUtils.createInputStream(inputFileName))); String docText = XmlHelper.readNextXMLIndexEntry(inpText); for (; docText != null && docNum < maxNumRec; docText = XmlHelper.readNextXMLIndexEntry(inpText)) { ++docNum; Map<String, String> docFields = null; Document luceneDoc = new Document(); try { docFields = XmlHelper.parseXMLIndexEntry(docText); } catch (Exception e) { System.err.println(String.format("Parsing error, offending DOC #%d:\n%s", docNum, docText)); System.exit(1); } String id = docFields.get(UtilConst.TAG_DOCNO); if (id == null) { System.err.println(String.format("No ID tag '%s', offending DOC #%d:\n%s", UtilConst.TAG_DOCNO, docNum, docText)); } luceneDoc.add(new StringField(UtilConst.TAG_DOCNO, id, Field.Store.YES)); for (Map.Entry<String, String> e : docFields.entrySet()) if (!e.getKey().equals(UtilConst.TAG_DOCNO)) { luceneDoc.add(new TextField(e.getKey(), e.getValue(), 
Field.Store.YES)); } indexWriter.addDocument(luceneDoc); if (docNum % 1000 == 0) System.out.println("Indexed " + docNum + " docs"); } System.out.println("Indexed " + docNum + " docs"); } indexWriter.commit(); indexWriter.close(); } catch (ParseException e) { Usage("Cannot parse arguments", options); } catch (Exception e) { System.err.println("Terminating due to an exception: " + e); System.exit(1); } }
From source file:edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.corpus.plain.PlainCorpusBuilder.java
License:Open Source License
private void initializeIndex(String index) { final File docDir = new File(index); Date start = new Date(); try {//w w w .j ava2s . c om System.out.println("Indexing to directory '" + docDir.getAbsolutePath() + "'..."); Directory dir = FSDirectory.open(new File(index)); Analyzer analyzer = new AnalyzerSpanish(Version.LUCENE_40); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, analyzer); //// in the directory, removing any // previously indexed documents: iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // iwc.setRAMBufferSizeMB(1024.0); writer = new IndexWriter(dir, iwc); // NOTE: if you want to maximize search performance, // you can optionally call forceMerge here. This can be // a terribly costly operation, so generally it's only // worth it when your index is relatively static (ie // you're done adding documents to it): // // writer.forceMerge(1); Date end = new Date(); System.out.println(end.getTime() - start.getTime() + " total milliseconds"); } catch (IOException e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } }
From source file:edu.ehu.galan.lite.algorithms.ranked.supervised.tfidf.corpus.wikipedia.WikiCorpusBuilder.java
License:Open Source License
private void initializeIndex(String index) { final File docDir = new File(index); Date start = new Date(); try {//from w ww . j a v a 2s. c o m System.out.println("Indexing to directory '" + docDir.getAbsolutePath() + "'..."); Directory dir = FSDirectory.open(new File(index)); Analyzer analyzer = new AnalyzerEnglish(Version.LUCENE_40); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, analyzer); //// in the directory, removing any // previously indexed documents: iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // iwc.setRAMBufferSizeMB(1024.0); writer = new IndexWriter(dir, iwc); // NOTE: if you want to maximize search performance, // you can optionally call forceMerge here. This can be // a terribly costly operation, so generally it's only // worth it when your index is relatively static (ie // you're done adding documents to it): // // writer.forceMerge(1); Date end = new Date(); System.out.println(end.getTime() - start.getTime() + " total milliseconds"); } catch (IOException e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } }
From source file:edu.rpi.tw.linkipedia.search.indexing.EntityIndexer.java
License:Open Source License
/**
 * Builds the entity index from scratch (OpenMode.CREATE, replacing any prior
 * index) by walking {@code sourceDirectory} and writing to {@code indexDirectory}.
 * Label/relation fields use a payload-aware entropy analyzer; everything else
 * uses the default analyzer. All exceptions are caught and printed.
 */
public void createIndex() {
    try {
        Analyzer stdAnalyzer = DefaultAnalyzer.getAnalyzer();
        PayloadEncoder encoder = new FloatEncoder();
        EntropyAnalyzer entropyAnalyzer = new EntropyAnalyzer(encoder);

        // Per-field routing: entropy analysis for label-like fields,
        // standard analysis for the rest.
        Map<String, Analyzer> fieldAnalyzers = new HashMap<String, Analyzer>();
        fieldAnalyzers.put("related_object", entropyAnalyzer);
        fieldAnalyzers.put("label", entropyAnalyzer);
        fieldAnalyzers.put("defaultLabel", entropyAnalyzer);
        fieldAnalyzers.put("analyzedLabel", stdAnalyzer);
        PerFieldAnalyzerWrapper perFieldAnalyzer = new PerFieldAnalyzerWrapper(stdAnalyzer, fieldAnalyzers);

        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, perFieldAnalyzer);
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        iwc.setRAMBufferSizeMB(4096);
        iwc.setMaxThreadStates(36);
        iwc.setSimilarity(new MySimilarity());

        Directory dir = FSDirectory.open(new File(indexDirectory));
        IndexWriter writer = new IndexWriter(dir, iwc);

        System.out.println("Indexing to directory '" + indexDirectory + "'...");
        indexDocs(writer, new File(sourceDirectory));
        System.out.println("Optimizing...");
        writer.close();
        System.out.println("Finished Indexing");
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:edu.rpi.tw.linkipedia.search.indexing.EntityIndexUpdater.java
License:Open Source License
/**
 * Updates the existing entity index in place (OpenMode.CREATE_OR_APPEND) by
 * walking {@code sourceDirectory} and writing to {@code indexDirectory}.
 * Mirrors {@code createIndex()} except for the open mode. All exceptions are
 * caught and printed.
 */
public void updateIndex() {
    try {
        Analyzer stdAnalyzer = DefaultAnalyzer.getAnalyzer();
        PayloadEncoder encoder = new FloatEncoder();
        EntropyAnalyzer entropyAnalyzer = new EntropyAnalyzer(encoder);

        // Per-field routing: entropy analysis for label-like fields,
        // standard analysis for the rest.
        Map<String, Analyzer> fieldAnalyzers = new HashMap<String, Analyzer>();
        fieldAnalyzers.put("related_object", entropyAnalyzer);
        fieldAnalyzers.put("label", entropyAnalyzer);
        fieldAnalyzers.put("defaultLabel", entropyAnalyzer);
        fieldAnalyzers.put("analyzedLabel", stdAnalyzer);
        PerFieldAnalyzerWrapper perFieldAnalyzer = new PerFieldAnalyzerWrapper(stdAnalyzer, fieldAnalyzers);

        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, perFieldAnalyzer);
        // CREATE_OR_APPEND: keep existing documents, add/replace new ones.
        iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        iwc.setRAMBufferSizeMB(4096);
        iwc.setMaxThreadStates(36);
        iwc.setSimilarity(new MySimilarity());

        Directory dir = FSDirectory.open(new File(indexDirectory));
        IndexWriter writer = new IndexWriter(dir, iwc);

        System.out.println("Update directory '" + indexDirectory + "'...");
        indexDocs(writer, new File(sourceDirectory));
        System.out.println("Optimizing...");
        writer.close();
        System.out.println("Finished Updating");
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:edu.rpi.tw.linkipedia.search.indexing.SurfaceFormIndexUpdater.java
License:Open Source License
public void updateIndex() { try {//from w w w.ja va2 s . co m Analyzer stdAnalyzer = DefaultAnalyzer.getAnalyzer(); PayloadEncoder encoder = new FloatEncoder(); EntropyAnalyzer entropyAnalyzer = new EntropyAnalyzer(encoder); Map<String, Analyzer> myAnalyzerMap = new HashMap<String, Analyzer>(); myAnalyzerMap.put("label", entropyAnalyzer); myAnalyzerMap.put("analyzedLabel", stdAnalyzer); PerFieldAnalyzerWrapper MyAnalyzer = new PerFieldAnalyzerWrapper(stdAnalyzer, myAnalyzerMap); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, MyAnalyzer); iwc.setSimilarity(new MySimilarity()); iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); Directory dir = FSDirectory.open(new File(indexDirectory)); IndexWriter writer = new IndexWriter(dir, iwc); System.out.println("Indexing to directory '" + indexDirectory + "'..."); indexDocs(writer, new File(sourceDirectory)); System.out.println("Optimizing..."); writer.close(); System.out.println("Finished Indexing"); } catch (Exception e) { e.printStackTrace(); } }
From source file:edu.stanford.muse.index.Indexer.java
License:Apache License
private IndexWriter openIndexWriter(Directory dir) throws IOException { //IndexWriterConfig config = new IndexWriterConfig(MUSE_LUCENE_VERSION, null); //IndexWriter writer = new IndexWriter(dir, null, IndexWriter.MaxFieldLength.UNLIMITED); IndexWriterConfig iwc = new IndexWriterConfig(LUCENE_VERSION, analyzer); iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); return new IndexWriter(dir, iwc); // , new IndexWriter.MaxFieldLength(250000)); }
From source file:edu.uci.ics.cs221wiki.IndexFiles.java
License:Apache License
/** Index all text files under a directory. */ public static void main(String[] args) { String usage = "java org.apache.lucene.demo.IndexFiles" + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n" + "This indexes the documents in DOCS_PATH, creating a Lucene index" + "in INDEX_PATH that can be searched with SearchFiles"; String indexPath = "index"; String docsPath = null;//from www . ja v a 2 s . co m boolean create = true; for (int i = 0; i < args.length; i++) { if ("-index".equals(args[i])) { indexPath = args[i + 1]; i++; } else if ("-docs".equals(args[i])) { docsPath = args[i + 1]; i++; } else if ("-update".equals(args[i])) { create = false; } } if (docsPath == null) { System.err.println("Usage: " + usage); System.exit(1); } final File docDir = new File(docsPath); if (!docDir.exists() || !docDir.canRead()) { System.out.println("Document directory '" + docDir.getAbsolutePath() + "' does not exist or is not readable, please check the path"); System.exit(1); } Date start = new Date(); try { System.out.println("Indexing to directory '" + indexPath + "'..."); Directory dir = FSDirectory.open(new File(indexPath)); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer); if (create) { // Create a new index in the directory, removing any // previously indexed documents: iwc.setOpenMode(OpenMode.CREATE); } else { // Add new documents to an existing index: iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); } // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // // iwc.setRAMBufferSizeMB(256.0); IndexWriter writer = new IndexWriter(dir, iwc); indexDocs(writer, docDir); // NOTE: if you want to maximize search performance, // you can optionally call forceMerge here. 
This can be // a terribly costly operation, so generally it's only // worth it when your index is relatively static (ie // you're done adding documents to it): // // writer.forceMerge(1); writer.close(); Date end = new Date(); System.out.println(end.getTime() - start.getTime() + " total milliseconds"); } catch (IOException e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } }