List of usage examples for `org.apache.lucene.index.IndexWriterConfig#setOpenMode`.
public IndexWriterConfig setOpenMode(OpenMode openMode)
From source file:it.drwolf.ridire.index.sketch.SketchCreatorManager.java
License:Apache License
public void closeIndex() { try {/*from ww w . j a va 2 s. com*/ if (this.sketchCreatorData.getIndexWriter() != null) { this.sketchCreatorData.getIndexWriter().close(); } else { String indexLocation = this.entityManager .find(Parameter.class, Parameter.SKETCH_INDEX_LOCATION.getKey()).getValue(); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_33, new KeywordAnalyzer()); indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND); IndexWriter indexWriter = new IndexWriter(new MMapDirectory(new File(indexLocation)), indexWriterConfig); if (indexWriter != null) { indexWriter.close(); } } } catch (CorruptIndexException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (LockObtainFailedException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } }
From source file:it.drwolf.ridire.index.sketch.SketchCreatorManager.java
License:Apache License
private void doCreateSketches(String indexLocation) { IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_33, new KeywordAnalyzer()); indexWriterConfig.setOpenMode(OpenMode.CREATE_OR_APPEND); try {/*from ww w .ja v a2 s.c o m*/ IndexWriter indexWriter = new IndexWriter(new MMapDirectory(new File(indexLocation)), indexWriterConfig); this.sketchCreatorData.setIndexWriter(indexWriter); } catch (CorruptIndexException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (LockObtainFailedException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } this.sketchCreatorData.setProcessNumber(this.processNumber); this.sketchCreatorData.setWorkingDir(this.workingDir); this.sketchCreator.createSketches(this.sketchCreatorData); }
From source file:it.unipd.dei.ims.lucene.clef.applications.BuildIndex.java
License:Apache License
public static void main(String[] args) { Properties properties = new Properties(); InputStream input = null;//w w w . ja v a2s. c om try { if (System.getProperty("properties.path") != null) { input = new FileInputStream(System.getProperty("properties.path")); properties.load(input); } else { logger.info("Loading default property file [resources/lucene-clef.properties]"); ClassLoader loader = Thread.currentThread().getContextClassLoader(); input = loader.getResourceAsStream("lucene-clef.properties"); properties.load(input); } } catch (IOException ex) { ex.printStackTrace(); } finally { if (input != null) { try { input.close(); } catch (IOException e) { e.printStackTrace(); } } } properties.putAll(System.getProperties()); String language = properties.getProperty("language"); String stemmer = properties.getProperty("stemmer"); String stopsetType = properties.getProperty("stopset.type"); String stopsetPath = null; if (stopsetType.equalsIgnoreCase("CUSTOM")) { stopsetPath = properties.getProperty("stopset.path"); } String corporaRootPath = properties.getProperty("corpora.path"); int corpusSize = Integer.parseInt(properties.getProperty(language + ".corpus.size")); String[] corpora = properties.getProperty(language + ".corpora").split(";"); TrecContentSource trecContentSource = new TrecContentSource(); try { Properties configProps = new Properties(); configProps.setProperty("trec.doc.parser", "it.unipd.dei.ims.lucene.clef.parser.ClefDocParser"); configProps.setProperty("content.source.verbose", "false"); configProps.setProperty("content.source.forever", "false"); configProps.setProperty("content.source.excludeIteration", "true"); configProps.setProperty("work.dir", new File(".").getAbsolutePath()); configProps.setProperty("language", language); configProps.setProperty("stemmer", stemmer); configProps.setProperty("stopset_type", stopsetType); configProps.setProperty("stopset_path", stopsetPath); // set lucene index directory Path indexPath = new 
File(properties.getProperty("index.path")).toPath(); Directory directory = new SimpleFSDirectory(indexPath); // indexing configuration CharArraySet stopset = AnalyzerFactory.createStopset(language, stopsetType, stopsetPath); Analyzer analyzer = AnalyzerFactory.createAnalyzer(language, stemmer, stopset); IndexWriterConfig conf = new IndexWriterConfig(analyzer); conf.setSimilarity(new BM25Similarity()); conf.setOpenMode(IndexWriterConfig.OpenMode.CREATE); IndexWriter indexWriter = new IndexWriter(directory, conf); boolean storePositions = true; FieldType bodyFieldType = new FieldType(); if (storePositions) { bodyFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); } else { bodyFieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS); } for (String corpus : corpora) { int docCount = 0; logger.info("... indexing corpus " + corpus); try { configProps.setProperty("docs.dir", corporaRootPath + "/" + corpus); configProps.setProperty("content.source.encoding", properties.getProperty(corpus + ".encoding", "UTF-8")); trecContentSource.setConfig(new Config(configProps)); DocData docData = new DocData(); while ((docData = trecContentSource.getNextDocData(docData)) != null) { docCount++; // System.out.println("ID: "+docData.getName()); // System.out.println("BODY: "+docData.getBody()); Document doc = getDocumentFromDocData(docData, bodyFieldType); indexWriter.addDocument(doc); } } catch (NoMoreDataException e) { logger.info("... " + docCount + " documents indexed for corpus " + corpus + "\n"); } } indexWriter.close(); DirectoryReader ireader = DirectoryReader.open(directory); if (corpusSize != ireader.numDocs()) { throw new Exception("The number of documents indexed is " + ireader.numDocs() + ", but should be " + corpusSize); } logger.info("Number of documents: " + ireader.numDocs()); } catch (IOException e) { e.printStackTrace(); } catch (Exception e) { e.printStackTrace(); } }
From source file:javatools.webapi.LuceneIndexFiles.java
License:Apache License
public static void indexDelimitedFile(String file, int indexColumn, int pathColumn, String dirIndex) { Date start = new Date(); try {//w ww. jav a2 s .c om if ((new File(dirIndex)).exists()) { (new File(dirIndex)).delete(); } Directory dir = FSDirectory.open(new File(dirIndex)); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer); iwc.setOpenMode(OpenMode.CREATE); // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // // iwc.setRAMBufferSizeMB(256.0); IndexWriter writer = new IndexWriter(dir, iwc); indexDocs(writer, file, indexColumn, pathColumn); // NOTE: if you want to maximize search performance, // you can optionally call optimize here. This can be // a costly operation, so generally it's only worth // it when your index is relatively static (ie you're // done adding documents to it): // // writer.optimize(); writer.close(); Date end = new Date(); System.out.println(end.getTime() - start.getTime() + " total milliseconds"); } catch (IOException e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } }
From source file:javatools.webapi.LuceneIndexFiles.java
License:Apache License
public static void indexDelimitedFile(String file, int indexColumn, int[] pathColumn, String dirIndex) { Date start = new Date(); try {/* ww w .j a v a2 s. co m*/ if ((new File(dirIndex)).exists()) { (new File(dirIndex)).delete(); } Directory dir = FSDirectory.open(new File(dirIndex)); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer); iwc.setOpenMode(OpenMode.CREATE); // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // // iwc.setRAMBufferSizeMB(256.0); IndexWriter writer = new IndexWriter(dir, iwc); indexDocs(writer, file, indexColumn, pathColumn); // NOTE: if you want to maximize search performance, // you can optionally call optimize here. This can be // a costly operation, so generally it's only worth // it when your index is relatively static (ie you're // done adding documents to it): // // writer.optimize(); writer.close(); Date end = new Date(); System.out.println(end.getTime() - start.getTime() + " total milliseconds"); } catch (IOException e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } }
From source file:javatools.webapi.LuceneIndexFiles.java
License:Apache License
public static void indexDelimitedFile(String file, int[] indexColumn, int[] pathColumn, String dirIndex) { Date start = new Date(); try {//from www. j a v a2 s . co m if ((new File(dirIndex)).exists()) { (new File(dirIndex)).delete(); } Directory dir = FSDirectory.open(new File(dirIndex)); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer); iwc.setOpenMode(OpenMode.CREATE); // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // // iwc.setRAMBufferSizeMB(256.0); IndexWriter writer = new IndexWriter(dir, iwc); indexDocs(writer, file, indexColumn, pathColumn); // NOTE: if you want to maximize search performance, // you can optionally call optimize here. This can be // a costly operation, so generally it's only worth // it when your index is relatively static (ie you're // done adding documents to it): // // writer.optimize(); writer.close(); Date end = new Date(); System.out.println(end.getTime() - start.getTime() + " total milliseconds"); } catch (IOException e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } }
From source file:javatools.webapi.LuceneIndexFiles.java
License:Apache License
/** Index all text files under a directory. */ public static void main2(String[] args) { String usage = "java org.apache.lucene.demo.IndexFiles" + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n" // TODO: Change the link with every release (or: fill in some less error-prone alternative here...) + "See http://lucene.apache.org/java/3_1/demo.html for details."; String indexPath = "index"; String docsPath = null;//ww w . ja va 2s . c o m boolean create = true; for (int i = 0; i < args.length; i++) { if ("-index".equals(args[i])) { indexPath = args[i + 1]; i++; } else if ("-docs".equals(args[i])) { docsPath = args[i + 1]; i++; } else if ("-update".equals(args[i])) { create = false; } } if (docsPath == null) { System.err.println("Usage: " + usage); System.exit(1); } final File docDir = new File(docsPath); if (!docDir.exists() || !docDir.canRead()) { System.out.println("Document directory '" + docDir.getAbsolutePath() + "' does not exist or is not readable, please check the path"); System.exit(1); } Date start = new Date(); try { System.out.println("Indexing to directory '" + indexPath + "'..."); Directory dir = FSDirectory.open(new File(indexPath)); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer); if (create) { // Create a new index in the directory, removing any // previously indexed documents: iwc.setOpenMode(OpenMode.CREATE); } else { // Add new documents to an existing index: iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); } // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // // iwc.setRAMBufferSizeMB(256.0); IndexWriter writer = new IndexWriter(dir, iwc); indexDocs(writer, docDir); // NOTE: if you want to maximize search performance, // you can optionally call optimize here. 
This can be // a costly operation, so generally it's only worth // it when your index is relatively static (ie you're // done adding documents to it): // // writer.optimize(); writer.close(); Date end = new Date(); System.out.println(end.getTime() - start.getTime() + " total milliseconds"); } catch (IOException e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } }
From source file:kbp2013.index.IndexSourceCorpus.java
License:Open Source License
/**
 * Indexes the KBP source corpus into a Lucene index.
 * <p>
 * Reads {@code homelist} (one corpus-file name per line, resolved against
 * {@code home}), splits each file into {@code <DOC>...</DOC>} elements, extracts
 * each document's id, and indexes id + full document text via {@code indexDocs}.
 * Commits periodically so a crash loses at most {@code tocount} documents.
 */
public static void main(String[] args) throws IOException {
    initializeFromDefault();

    int managed = 0; // docs indexed since the last commit
    int counted = 0; // total docs indexed (progress display)
    int tocount = 10; // commit/report every `tocount` documents

    System.out.println("Indexing to directory '" + luceneIndex + "'...");
    INDEX_DIR = new File(luceneIndex);
    if (INDEX_DIR.exists() && create == 1) {
        System.out.println("Cannot save index to '" + INDEX_DIR + "' directory, please delete it first");
        System.exit(1);
    }
    Directory dir = FSDirectory.open(new File(luceneIndex));

    // Open lucene stuff
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer);
    // iwc.setRAMBufferSizeMB(1024); // http://wiki.apache.org/lucene-java/ImproveIndexingSpeed
    iwc.setMaxThreadStates(100);

    // manage append mode
    if (create == 0) {
        // add new documents to an existing index
        iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        // if appending, optionally verify the existing index first
        if (checkindex == 1) {
            System.out.println("Checking index ...");
            CheckIndex ci = new CheckIndex(dir);
            ci.checkIndex();
            System.out.println("End of Checking index");
        }
    } else {
        iwc.setOpenMode(OpenMode.CREATE);
    }

    // build writer
    IndexWriter writer = new IndexWriter(dir, iwc);

    final File docDir = new File(home);
    System.out.println("Indexing directory '" + home + "'...");
    if (!docDir.exists() || !docDir.canRead()) {
        System.out.println("Document directory '" + docDir.getAbsolutePath()
                + "' does not exist or is not readable, please check the path");
        System.exit(1);
    }

    // homelist lists the corpus files to index, one name per line
    BufferedReader reader = new BufferedReader(new FileReader(homelist));

    String text = "";
    boolean verbose = true;
    while ((text = reader.readLine()) != null) {
        String filename = home + text;
        final File testFile = new File(filename);

        if (verbose) {
            System.out.println("---V-->" + "Indexing content of " + filename);
        }

        // NOTE(review): contains("\\.gz") matches the literal characters
        // backslash-dot-g-z, NOT the regex ".gz" — this filter almost certainly
        // never excludes anything; confirm the intended behaviour.
        if (testFile.isFile() && !filename.contains("\\.gz")) {
            // open file and read
            FileReader fread = new FileReader(filename);
            BufferedReader readerDoc = new BufferedReader(fread);

            String fileRef = ""; // the line containing the document id
            String fromfile = ""; // current line of the per-file reader
            String textdoc = ""; // accumulated text of the current <DOC> element

            while ((fromfile = readerDoc.readLine()) != null) {
                if (fromfile.toUpperCase().contains("<DOC ID=") || fromfile.toUpperCase().contains("<DOC>")) {
                    String fromdoc = fromfile;
                    // keep the opening line so mention offsets stay correct
                    textdoc = fromfile;
                    // Accumulate lines until the closing </DOC>.
                    // NOTE(review): if EOF occurs before </DOC>, readLine() returns
                    // null and the next toUpperCase() throws NPE — this assumes
                    // well-formed input files; confirm.
                    while (!fromdoc.toUpperCase().contains("</DOC>")) {
                        // remember the line carrying the doc id: it may be the
                        // opening line (first iteration) or a later <DOCID> line
                        if (fromdoc.toUpperCase().contains("<DOC ID=")
                                || fromdoc.toUpperCase().contains("<DOCID>")) {
                            fileRef = fromdoc;
                        }
                        fromdoc = readerDoc.readLine();
                        textdoc = textdoc + "\n" + fromdoc;
                    }
                    // Extract the document id; two forms occur in the corpus:
                    //   <DOCID> ALHURRA_NEWS13_ARB_20050412_130100-2.LDC2006E92 </DOCID>
                    //   <doc id="bolt-eng-DF-183-195681-7948494">
                    String idStr = fileRef;
                    if (idStr.contains("<DOCID>")) {
                        idStr = idStr.replace("<DOCID>", "");
                        idStr = idStr.replace("</DOCID>", "");
                        idStr = idStr.replace(" ", ""); // strip spaces
                    }
                    if (idStr.contains("<DOC id=")) {
                        idStr = idStr.replace("<DOC id=\"", "");
                        idStr = idStr.replaceAll("\".+>$", "");
                    }
                    indexDocs(writer, idStr, textdoc);

                    // progress reporting + periodic commit
                    managed++;
                    counted++;
                    if (verbose) {
                        System.out.println(
                                "---V-->" + counted + ":" + filename + ":" + idStr + ":" + textdoc.length());
                    }
                    if (managed > tocount) {
                        managed = 0;
                        System.out.println(counted + ":" + filename + ":------>" + idStr);
                        // commit keeps the index durable and memory bounded
                        writer.commit();
                    }
                } // end of if
            } // end of while
            readerDoc.close();
            fread.close();
        } else {
            // "Non lisible ou non requis" = not readable or not required
            System.out.println(counted + ":Non lisible ou non requis:" + filename);
        }
    }

    // close properly the index writer
    // !! Caution !! in case of error, if this is not closed, the index is corrupted
    // and has to be regenerated
    writer.close();
    reader.close();
}
From source file:kbp2013.index.IndexSourceCorpus_v2.java
License:Open Source License
/**
 * Indexes a list of (possibly gzip-compressed) source files, one Lucene document
 * per {@code <doc>...</doc>} element, into the index at {@code indexDir}.
 * <p>
 * {@code inputLstFile} lists one source-file path per line. For each document,
 * both a whitespace-joined version (for field extraction) and a raw version
 * (line breaks marked with the literal token {@code MY_CUSTOM_SPACE}) are kept.
 *
 * @param args unused; paths come from the class fields indexDir / inputLstFile
 * @throws IOException            on any read/write failure
 * @throws FileNotFoundException  if a listed file is missing
 * @throws ClassNotFoundException declared by the original signature
 * @throws Exception              declared by the original signature
 */
public static void main(String[] args)
        throws IOException, FileNotFoundException, ClassNotFoundException, Exception {
    Date start = new Date();

    Directory targetIndexDir = FSDirectory.open(indexDir);
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, analyzer);
    if (indexDir.exists() == false) {
        // Create a new index in the directory, removing any
        // previously indexed documents
        iwc.setOpenMode(OpenMode.CREATE);
    } else {
        System.err.println("Adding files to existing index: '" + indexDir);
        // Add new documents to an existing index
        iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
    }
    // set ram buffer size (optional)
    iwc.setRAMBufferSizeMB(256.0);
    IndexWriter writer = new IndexWriter(targetIndexDir, iwc);
    System.err.println("Indexing to directory '" + indexDir + "'...");

    int docCount = 1;
    BufferedReader reader = new BufferedReader(new FileReader(inputLstFile));
    String currentDocPath;
    while ((currentDocPath = reader.readLine()) != null) {
        currentDocPath = currentDocPath.trim();
        System.err.println("Processing file: " + currentDocPath);
        // Processing each file; gzip files get an extra GZIPInputStream layer
        InputStream fileInputStream = new BufferedInputStream(new FileInputStream(currentDocPath));
        InputStreamReader streamReader;
        GZIPInputStream zipReader = null;
        if (currentDocPath.endsWith(".gz")) {
            // case when the file to index is a gzip file
            zipReader = new GZIPInputStream(fileInputStream);
            streamReader = new InputStreamReader(zipReader);
        } else {
            streamReader = new InputStreamReader(fileInputStream);
        }
        BufferedReader br = new BufferedReader(streamReader);

        String docTitle = "";
        String fileContent = "";
        String line;
        String docId = "";
        StringBuilder pageBuffer = new StringBuilder(); // space-joined page content
        // raw content with line breaks preserved as the MY_CUSTOM_SPACE marker
        StringBuilder rawPageBuffer = new StringBuilder();
        while ((line = br.readLine()) != null) {
            if (StringUtils.contains(line.toLowerCase(), "</doc>") == true) {
                pageBuffer.append(line).append(" ");
                rawPageBuffer.append(line + "MY_CUSTOM_SPACE");
                if (pageBuffer.length() > 0) {
                    // NOTE(review): the first argument of this replaceAll renders as a
                    // plain space here but was most likely a non-breaking space in the
                    // original source (normalizing NBSP to space) — confirm before editing.
                    fileContent = pageBuffer.toString().replaceAll(" ", " ");
                    docId = extractDocId(fileContent);
                    // get the title of the page
                    docTitle = extractTitle(fileContent);
                    // get the content of the page
                    String content = extractContent(fileContent);
                    String rawContent = extractRawContent(rawPageBuffer.toString());
                    indexDocument(writer, docId, content, docTitle, rawContent);
                    System.err.println("Processed " + docCount + " documents");
                    docCount++;
                }
                // reset buffers for the next document
                pageBuffer = new StringBuilder();
                rawPageBuffer = new StringBuilder();
            }
            // NOTE(review): this append also executes for the "</doc>" line handled
            // above, so the closing line is appended twice — once into the finished
            // document and once into the freshly reset buffer for the next document.
            // Confirm whether this is intended.
            pageBuffer.append(line).append(" ");
            rawPageBuffer.append(line + "MY_CUSTOM_SPACE");
        }
        fileInputStream.close();
        if (currentDocPath.endsWith(".gz") && zipReader != null) {
            zipReader.close();
        }
        streamReader.close();
    }
    reader.close();
    writer.close();

    Date end = new Date();
    System.err.println(end.getTime() - start.getTime() + " total milliseconds");
}
From source file:kbp2013.index.IndexWikipediaCorpus.java
License:Open Source License
/**
 * Indexes a Wikipedia XML dump into a Lucene index.
 * <p>
 * First reads the dump's {@code <siteinfo>} header to collect the non-article
 * namespace prefixes ("Talk:", "File:", ...), then streams {@code <page>}
 * elements, skipping namespace pages and {@code #REDIRECT} pages, and indexes
 * the remaining ones via {@code indexDocs}. Commits periodically.
 */
public static void main(String[] args) throws IOException {
    initializeFromDefault();

    int managed = 0; // pages processed since the last commit
    int counted = 0; // total pages processed (progress display)
    int tocount = 1000; // commit/report every `tocount` pages
    int saved = 0; // pages actually indexed (redirects/namespace pages excluded)

    System.out.println("Indexing Wikipedia Dump to directory '" + wikiluceneIndex + "'...");
    INDEX_DIR = new File(wikiluceneIndex);
    if (INDEX_DIR.exists() && create == 1) {
        System.out.println("Cannot save index to '" + INDEX_DIR + "' directory, please delete it first");
        System.exit(1);
    }
    Directory dir = FSDirectory.open(new File(wikiluceneIndex));

    // Open lucene stuff
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
    IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, analyzer);
    // configure Lucene Stuff
    iwc.setMaxThreadStates(100);

    // manage append mode
    if (create == 0) {
        // add new documents to an existing index
        iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        // if appending, optionally verify the existing index first
        if (checkindex == 1) {
            System.out.println("Checking index ...");
            CheckIndex ci = new CheckIndex(dir);
            ci.checkIndex();
            System.out.println("End of Checking index");
        }
    } else {
        iwc.setOpenMode(OpenMode.CREATE);
    }

    // build writer
    IndexWriter writer = new IndexWriter(dir, iwc);

    // --------------------------
    //
    // Open the Wikipedia Dump
    //
    // --------------------------
    BufferedReader reader = new BufferedReader(new FileReader(wikidump));

    // Read the <siteinfo> header and register the non-article namespaces so
    // their pages can be skipped later.
    // NOTE(review): if the dump ends before "</siteinfo>", readLine() returns
    // null and text.contains(...) throws NPE — assumes a well-formed dump.
    String text = "";
    ArrayList domain = new ArrayList();
    while (!text.contains("</siteinfo>")) {
        text = reader.readLine();
        // namespace key 0 is the main article namespace and is NOT registered
        if (text.contains("<namespace key=") && !text.contains("<namespace key=\"0")) {
            String thisnamespace = text.replaceAll("<namespace key=[^>]+>", "");
            thisnamespace = thisnamespace.replaceAll("</namespace>", "");
            thisnamespace = thisnamespace.replaceAll("^[ ]+", "");
            thisnamespace = thisnamespace + ":";
            if (!thisnamespace.contentEquals("")) {
                domain.add(thisnamespace);
                System.out.println("Registered domain:" + thisnamespace + ";");
            }
        }
    }
    System.out.println("--------------------------------");

    // read the pages
    while ((text = reader.readLine()) != null) {
        String textdoc = ""; // accumulated XML of the current <page>
        String pagename = "";
        boolean tosave = true;

        // beginning of a page: accumulate lines until </page>
        if (text.contains("<page>")) {
            textdoc = text;
            while (!text.contains("</page>")) {
                text = reader.readLine();
                textdoc = textdoc + text;
                if (text.contains("<title>")) {
                    pagename = text.replaceAll("<title>", "");
                    pagename = pagename.replaceAll("</title>", "");
                    // collapse runs of spaces left by the XML indentation
                    pagename = pagename.replaceAll("[ ]{2,10}", "");
                }
                // safety
            }
            // after reading the whole page, decide whether to index it:
            // A) skip pages whose title carries a registered non-article namespace
            for (int a = 0; a < domain.size(); a++) {
                String domaintosearch = domain.get(a).toString();
                if (pagename.toLowerCase().contains(domaintosearch.toLowerCase())) {
                    System.out.println("Specific page:" + pagename);
                    tosave = false;
                }
            }
            // B) skip redirect pages
            if (textdoc.contains("#REDIRECT")) {
                tosave = false;
            }
            if (tosave) {
                saved++;
                indexDocs(writer, pagename, textdoc);
            }
            // display info + periodic commit to keep memory bounded
            managed++;
            counted++;
            if (managed > tocount) {
                managed = 0;
                System.out.println(counted + ":" + saved + ":" + pagename + ":------>" + textdoc.length());
                writer.commit();
            }
        }
    } // end while

    // close properly the index writer
    // !! Caution !! in case of error, if this is not closed, the index is corrupted
    // and has to be regenerated
    writer.close();
    reader.close();
}