List of usage examples for org.apache.lucene.index.IndexWriter.addDocument
public long addDocument(Iterable<? extends IndexableField> doc) throws IOException
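Before the collected examples, a minimal self-contained sketch of the call may help. It is not taken from any of the projects below; the index path, field names, and values are illustrative. It targets a recent Lucene release (7.x or later, matching the long sequence-number return type in the signature above), whereas most snippets on this page use older 3.x/4.x APIs whose constructors differ.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class AddDocumentSketch {
    public static void main(String[] args) throws IOException {
        Directory dir = FSDirectory.open(Paths.get("example-index")); // illustrative path
        IndexWriterConfig conf = new IndexWriterConfig(new StandardAnalyzer());
        try (IndexWriter writer = new IndexWriter(dir, conf)) {
            Document doc = new Document();
            doc.add(new StringField("id", "1", Field.Store.YES));           // indexed verbatim, stored
            doc.add(new TextField("body", "hello lucene", Field.Store.NO)); // analyzed full text
            long seqNo = writer.addDocument(doc); // sequence number ordering this operation
            System.out.println("added document, sequence number " + seqNo);
        } // close() also commits pending changes by default
    }
}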
From source file:driver651.Driver651.java
License:Apache License
public static void main(String[] args) throws Exception { // @FieldCacheImpl.java
    int threadNo = Integer.parseInt(args[0]);
    // 138 vs 179 507 (original) all-threads-one-cache
    // 1295 1779 one-thread-one-cache
    RAMDirectory directory = new RAMDirectory();
    IndexWriter writer = new IndexWriter(directory, new WhitespaceAnalyzer(), true);
    int theInt = Integer.MAX_VALUE;
    for (int j = 0; j < NUM_FIELDS; j++) {
        for (int i = 0; i < NUM_DOCS; i++) {
            Document doc = new Document();
            doc.add(new Field("theField" + j, String.valueOf(theInt--),
                    Field.Store.NO, Field.Index.UN_TOKENIZED)); // notice the field "theFieldj"
            writer.addDocument(doc);
        }
    }
    writer.close();
    reader = IndexReader.open(directory);
    // move it out of the loop, then you get the all-threads-one-cache scenario!
    FieldCacheImpl cache = new FieldCacheImpl();
    WorkerThread[] workers = new WorkerThread[threadNo];
    for (int i = 0; i < threadNo; i++) {
        workers[i] = new WorkerThread(cache);
    }
    long start = System.currentTimeMillis();
    for (int i = 0; i < threadNo; i++) {
        workers[i].start();
    }
    for (int i = 0; i < threadNo; i++) {
        workers[i].join();
    }
    long end = System.currentTimeMillis();
    System.out.println("duration: " + (end - start));
}
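This snippet predates Lucene 4.0: the (Directory, Analyzer, boolean) IndexWriter constructor and Field.Index.UN_TOKENIZED were removed long ago. Under the 4.x+ field API used by most later examples on this page, the not-analyzed, unstored field line would roughly become the following (a sketch, assuming the same intent):

doc.add(new StringField("theField" + j, String.valueOf(theInt--), Field.Store.NO));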
From source file:dynamicrefactoring.interfaz.wizard.search.internal.SearchableTypeIndexer.java
License:Open Source License
/**
 * Generates an index in the given directory.
 *
 * @param elementType
 *            searchable type whose elements are to be indexed
 * @param directory
 *            directory in which the index will be generated
 *
 * @return number of elements indexed
 * @throws CorruptIndexException
 * @throws IOException
 */
@Override
public int index(SearchableType elementType, Directory directory) throws IOException {
    final IndexWriter writer = createWriter(directory);
    int numIndexed = 0;
    JavadocReader javadocReader = EclipseBasedJavadocReader.INSTANCE;
    for (String fullyQualifiedName : elementType.getClassesToIndex()) {
        String text = javadocReader.getTypeJavaDocAsPlainText(fullyQualifiedName);
        Document doc = getDocument(fullyQualifiedName, text);
        writer.addDocument(doc);
        numIndexed++;
    }
    close(writer);
    return numIndexed;
}
From source file:edu.albany.ir.example.IndexFiles.java
License:Apache License
/**
 * Indexes the given file using the given writer, or if a directory is
 * given, recurses over files and directories found under the given
 * directory.
 *
 * NOTE: This method indexes one document per input file. This is slow. For
 * good throughput, put multiple documents into your input file(s). An
 * example of this is in the benchmark module, which can create "line doc"
 * files, one document per line, using the <a href=
 * "../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *
 * @param writer
 *            Writer to the index where the given file/dir info will be
 *            stored
 * @param file
 *            The file to index, or the directory to recurse into to find
 *            files to index
 * @throws IOException
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list(); // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this
                // exception with an "access denied" message;
                // checking if the file can be read doesn't help
                fnfe.printStackTrace();
                return;
            }
            try {
                String record;
                int a, b, stringNum = 0;
                String docName = null;

                // make a new, empty document
                Document doc = new Document();
                BufferedReader reader = new BufferedReader(new InputStreamReader(fis));
                while ((record = reader.readLine()) != null) {
                    a = record.lastIndexOf("<DOCNO>");
                    b = record.indexOf("</DOCNO>");
                    if (a >= 0 && b > 0) { // this line contains the DOCNO
                        stringNum++;
                        docName = record.substring(a + 7, b).trim();
                        // index the previous document before starting a new one
                        if (stringNum >= 2)
                            writer.addDocument(doc);
                        // start new document
                        doc = new Document();
                        // Add the document name as a field named "path". Use a
                        // field that is indexed (i.e. searchable), but don't
                        // tokenize the field into separate words and don't index
                        // term frequency or positional information:
                        Field pathField = new Field("path", docName, Field.Store.YES,
                                Field.Index.NOT_ANALYZED_NO_NORMS);
                        pathField.setOmitTermFreqAndPositions(true);
                        doc.add(pathField);
                        System.out.println("adding " + docName);
                        // Add the last modified date of the file in a field named
                        // "modified". Use a NumericField that is indexed (i.e.
                        // efficiently filterable with NumericRangeFilter). This
                        // indexes to millisecond resolution, which is often too
                        // fine. You could instead create a number based on
                        // year/month/day/hour/minutes/seconds, down to the
                        // resolution you require. For example the long value
                        // 2011021714 would mean February 17, 2011, 2-3 PM.
                        NumericField modifiedField = new NumericField("modified");
                        modifiedField.setLongValue(file.lastModified());
                        doc.add(modifiedField);
                    } else {
                        // Add the line to a field named "contents": tokenized,
                        // stored, with term vectors.
                        doc.add(new Field("contents", record, Field.Store.YES,
                                Field.Index.ANALYZED, Field.TermVector.YES));
                    }
                    a = 0;
                    b = 0;
                }
                // add or update the last document read from this file
                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document
                    // can be there):
                    System.out.println("adding " + docName);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been
                    // indexed) so we use updateDocument instead to replace the
                    // old one matching the exact path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
    }
}
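The add-or-update branch that ends this method is a common Lucene idiom worth isolating: on a freshly created index a plain addDocument suffices, while on an existing index updateDocument atomically deletes any document whose "path" term matches before adding the new one, so re-indexing never duplicates. A stripped-down sketch (same 3.x-era API; key is a stand-in and must equal the value actually stored in the document's "path" field for the replacement to take effect):

if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
    writer.addDocument(doc);                           // fresh index: nothing to replace
} else {
    writer.updateDocument(new Term("path", key), doc); // delete any doc whose "path" equals key, then add
}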
From source file:edu.cmu.cs.in.search.HoopLuceneIndex.java
License:Apache License
/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 *
 * NOTE: This method indexes one document per input file. This is slow. For good
 * throughput, put multiple documents into your input file(s). An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list(); // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception
                // with an "access denied" message; checking if the file can be
                // read doesn't help
                return;
            }
            try {
                // make a new, empty document
                Document doc = new Document();

                // Add the path of the file as a field named "path". Use a
                // field that is indexed (i.e. searchable), but don't tokenize
                // the field into separate words and don't index term frequency
                // or positional information:
                Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                doc.add(pathField);

                // Add the last modified date of the file in a field named "modified".
                // Use a LongField that is indexed (i.e. efficiently filterable with
                // NumericRangeFilter). This indexes to millisecond resolution, which
                // is often too fine. You could instead create a number based on
                // year/month/day/hour/minutes/seconds, down to the resolution you
                // require. For example the long value 2011021714 would mean
                // February 17, 2011, 2-3 PM.
                doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));

                // Add the contents of the file to a field named "contents". Specify
                // a Reader, so that the text of the file is tokenized and indexed,
                // but not stored. Note that FileReader expects the file to be in
                // UTF-8 encoding. If that's not the case searching for special
                // characters will fail.
                doc.add(new TextField("contents",
                        new BufferedReader(new InputStreamReader(fis, "UTF-8"))));

                if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                    // New index, so we just add the document (no old document can be there):
                    System.out.println("adding " + file);
                    writer.addDocument(doc);
                } else {
                    // Existing index (an old copy of this document may have been
                    // indexed) so we use updateDocument instead to replace the old
                    // one matching the exact path, if present:
                    System.out.println("updating " + file);
                    writer.updateDocument(new Term("path", file.getPath()), doc);
                }
            } finally {
                fis.close();
            }
        }
    }
}
From source file:edu.cmu.geolocator.resource.gazindexing.CollaborativeIndex.GazInfoIndexerAllCountries.java
License:Apache License
void indexGazatteer(BufferedReader br, IndexWriter iw) throws IOException {
    Document d = new Document();
    StringField nfid = new StringField("ID", "0", Field.Store.YES);
    StringField name = new StringField("ORIGINAL-NAME", "", Field.Store.YES);
    IntField nfaltnames = new IntField("ALTNAME-COUNT", 0, Field.Store.YES);
    DoubleField nflong = new DoubleField("LONGTITUDE", 0.0, Field.Store.YES);
    DoubleField nfla = new DoubleField("LATITUDE", 0.0, Field.Store.YES);
    LongField nfpop = new LongField("POPULATION", 0, Field.Store.YES);
    StringField sfcountry = new StringField("COUNTRY-CODE", "", Field.Store.YES);
    StringField sfadm1 = new StringField("ADM1-CODE", "", Field.Store.YES);
    StringField sfadm2 = new StringField("ADM2-CODE", "", Field.Store.YES);
    StringField sfadm3 = new StringField("ADM3-CODE", "", Field.Store.YES);
    StringField sfadm4 = new StringField("ADM4-CODE", "", Field.Store.YES);
    StringField sffeatureclass = new StringField("FEATURE-CLASS", "", Field.Store.YES);
    StringField sffeature = new StringField("FEATURE", "", Field.Store.YES);
    StringField sftimezone = new StringField("TIMEZONE", "", Field.Store.YES);
    d.add(nfid);
    d.add(name);
    d.add(nfaltnames);
    d.add(nflong);
    d.add(nfla);
    d.add(nfpop);
    d.add(sfcountry);
    d.add(sfadm1);
    d.add(sfadm2);
    d.add(sfadm3);
    d.add(sfadm4);
    d.add(sffeatureclass);
    d.add(sffeature);
    d.add(sftimezone);

    String line;
    int linen = 0;
    while ((line = br.readLine()) != null) {
        if (linen++ % 10000 == 0)
            System.out.println(linen + "\n" + line);
        String[] column = line.trim().split("\t");
        // get other columns except for the location words
        String id = column[0];
        String utfname = column[1];
        String altnames = column[3];
        String latitude = column[4];
        String longtitude = column[5];
        double dlong, dla;
        if (latitude == null) {
            dlong = 999;
            dla = 999;
        } else {
            dlong = Double.parseDouble(longtitude);
            dla = Double.parseDouble(latitude);
        }
        String featureclass = column[6];
        String feature = column[7];
        String country = column[8];
        String population = column[14];
        long longpop;
        if (population == null)
            longpop = -1;
        else
            longpop = Long.parseLong(population); // parse only when a value is present
        String timezone = column[17];
        // set values to document d, and index it
        nfid.setStringValue(id);
        name.setStringValue(utfname);
        nfaltnames.setIntValue(altnames.split(",").length);
        nflong.setDoubleValue(dlong);
        nfla.setDoubleValue(dla);
        nfpop.setLongValue(longpop);
        sfcountry.setStringValue(country.toLowerCase());
        sfadm1.setStringValue(column[10].toLowerCase());
        sfadm2.setStringValue(column[11].toLowerCase());
        sfadm3.setStringValue(column[12].toLowerCase());
        sfadm4.setStringValue(column[13].toLowerCase());
        sffeatureclass.setStringValue(featureclass);
        sffeature.setStringValue(feature);
        sftimezone.setStringValue(timezone);
        // add this new document.
        iw.addDocument(d);
    }
}
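Note the reuse pattern in this method: the Document and all of its Field instances are allocated once, mutated with set*Value on every record, and passed repeatedly to addDocument, which copies the current values into the index; this avoids per-record allocations when bulk-loading. A stripped-down sketch of the same idiom (field name and input source assumed):

Document doc = new Document();
StringField id = new StringField("ID", "", Field.Store.YES);
doc.add(id);
String line;
while ((line = br.readLine()) != null) {
    id.setStringValue(line.trim()); // mutate in place; no new Document/Field per record
    iw.addDocument(doc);            // the field's value at call time is what gets indexed
}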
From source file:edu.cmu.geolocator.resource.gazindexing.CollaborativeIndex.GazStringIndexerAllCountries.java
License:Apache License
void indexGazatteer(BufferedReader br, IndexWriter iw) throws IOException, InterruptedException {
    Document d = new Document();
    StringField nfid = new StringField("ID", "", Field.Store.YES);
    StringField sforigin = new StringField("LOWERED_ORIGIN", "", Field.Store.YES);
    StringField normnws = new StringField("LOWERED-NO-WS", "", Field.Store.YES);
    TextField sfunigram = new TextField("UNIGRAM", "", Field.Store.YES);
    TextField sfbigram = new TextField("BIGRAM", "", Field.Store.YES);
    TextField sftrigram = new TextField("TRIGRAM", "", Field.Store.YES);
    StringField lang = new StringField("LANG", "", Field.Store.YES);
    d.add(nfid);
    d.add(sforigin);
    d.add(normnws);
    d.add(sfunigram);
    d.add(sfbigram);
    d.add(sftrigram);
    d.add(lang); // attach LANG to the document so the values set below are indexed

    String line;
    String[] column;
    String id, phrase, otherlang, oneOtherlang;
    int otherlangLength;
    int linen = 0;
    while ((line = br.readLine()) != null) {
        if (linen++ % 10000 == 0)
            System.out.println(linen + "\n" + line);
        column = line.trim().split("\t");
        // get other columns except for the location words
        id = column[0];
        phrase = column[1];
        otherlang = column[3];
        // set values to document d, and index it
        nfid.setStringValue(id);
        // id does not change; change the strings to index.
        phrase = phrase.toLowerCase();
        sforigin.setStringValue(phrase);
        normnws.setStringValue(phrase.replaceAll(" ", ""));
        getIndexFeatures(phrase);
        sfunigram.setStringValue(getUnigram());
        sfbigram.setStringValue(getBigram());
        sftrigram.setStringValue(getTrigram());
        lang.setStringValue("");
        // add this new document.
        iw.addDocument(d);
        if (otherlang.length() == 0)
            continue;
        String otherlangs[] = otherlang.split(",");
        otherlangLength = otherlangs.length;
        for (int i = 0; i < otherlangLength; i++) {
            if (otherlangs[i].length() == 0)
                continue;
            // id does not change; change the strings to index.
            oneOtherlang = otherlangs[i].toLowerCase();
            sforigin.setStringValue(oneOtherlang);
            normnws.setStringValue(oneOtherlang.replaceAll(" ", ""));
            getIndexFeatures(oneOtherlang);
            sfunigram.setStringValue(getUnigram());
            sfbigram.setStringValue(getBigram());
            sftrigram.setStringValue(getTrigram());
            lang.setStringValue("");
            // add this new document.
            iw.addDocument(d);
        }
    }
}
From source file:edu.cmu.geolocator.resource.gazindexing.CollaborativeIndex.GazStringIndexerAltNames.java
License:Apache License
void indexAlterNames(BufferedReader br, IndexWriter iw) throws IOException, InterruptedException {
    Document d = new Document();
    StringField nfid = new StringField("ID", "", Field.Store.YES);
    StringField sforigin = new StringField("LOWERED_ORIGIN", "", Field.Store.YES);
    StringField normnws = new StringField("LOWERED-NO-WS", "", Field.Store.YES);
    TextField sfunigram = new TextField("UNIGRAM", "", Field.Store.YES);
    TextField sfbigram = new TextField("BIGRAM", "", Field.Store.YES);
    TextField sftrigram = new TextField("TRIGRAM", "", Field.Store.YES);
    StringField lang = new StringField("LANG", "", Field.Store.YES);
    d.add(nfid);
    d.add(sforigin);
    d.add(normnws);
    d.add(sfunigram);
    d.add(sfbigram);
    d.add(sftrigram);
    d.add(lang); // attach LANG to the document so the language/link value is indexed

    String line;
    String[] column;
    String id, langOrLink, phrase;
    int linen = 0;
    while ((line = br.readLine()) != null) {
        if (linen++ % 10000 == 0)
            System.out.println(linen + "\n" + line);
        column = line.trim().split("\t");
        // get other columns except for the location words
        id = column[1];
        langOrLink = column[2];
        phrase = column[3];
        // set values to document d, and index it
        nfid.setStringValue(id);
        phrase = phrase.toLowerCase();
        sforigin.setStringValue(phrase);
        normnws.setStringValue(phrase.replaceAll(" ", ""));
        getIndexFeatures(phrase);
        sfunigram.setStringValue(getUnigram());
        sfbigram.setStringValue(getBigram());
        sftrigram.setStringValue(getTrigram());
        lang.setStringValue(langOrLink);
        // add this new document.
        iw.addDocument(d);
    }
}
From source file:edu.cmu.lti.huiying.ir.rangedsearch.TableIndexer.java
License:Apache License
public void indexExplodedXml(IndexWriter writer, File file) throws IOException {
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexExplodedXml(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis = new FileInputStream(file);
            try {
                NumericFeatureGenerator nfg = new NumericFeatureGenerator();
                if (this.xmlreader == null) {
                    this.xmlreader = new XmlStAXReader();
                }
                Article a = xmlreader.readArticleFromXml(file.getAbsolutePath());
                for (Table t : a.tables) {
                    for (Group g : t.groups) {
                        for (Column col : g.columns) {
                            // index one document per column
                            Document coldoc = new Document();
                            ArrayList<Double> cfv = nfg.getFeatureVector(col.content);
                            if (cfv.get(0) != null)
                                coldoc.add(new DoubleField("intratio", cfv.get(0), Field.Store.NO));
                            if (cfv.get(1) != null)
                                coldoc.add(new DoubleField("floatratio", cfv.get(1), Field.Store.NO));
                            if (cfv.get(3) != null)
                                coldoc.add(new DoubleField("mean", cfv.get(3), Field.Store.NO));
                            if (cfv.get(4) != null)
                                coldoc.add(new DoubleField("std", cfv.get(4), Field.Store.NO));
                            if (cfv.get(6) != null)
                                coldoc.add(new DoubleField("min", cfv.get(6), Field.Store.NO));
                            if (cfv.get(7) != null)
                                coldoc.add(new DoubleField("max", cfv.get(7), Field.Store.NO));
                            if (cfv.get(8) != null)
                                coldoc.add(new DoubleField("acc", cfv.get(8), Field.Store.NO));
                            if (cfv.get(11) != null)
                                coldoc.add(new DoubleField("colmag", cfv.get(11), Field.Store.NO));
                            StringField wholegroup = new StringField("wholegroup", g.toString(), Field.Store.YES);
                            if (wholegroup.stringValue().getBytes().length > 32760) {
                                wholegroup.setStringValue("Table too large...");
                                System.err.println("table too large:" + wholegroup.stringValue().getBytes().length);
                            }
                            String headers = "";
                            if (col.headers != null) {
                                for (Header hdr : col.headers) {
                                    headers += hdr.text.toLowerCase() + " ";
                                }
                            }
                            coldoc.add(new TextField("headerkeywords", headers.trim(), Field.Store.NO));
                            coldoc.add(wholegroup);
                            coldoc.add(new StringField("filename", file.getAbsolutePath(), Field.Store.YES));
                            coldoc.add(new StringField("type", "column", Field.Store.YES));
                            IntField bstart = new IntField("bytestart",
                                    col.content.get(0).byteStart, Field.Store.YES);
                            IntField bend = new IntField("byteend",
                                    col.content.get(col.content.size() - 1).byteEnd, Field.Store.YES);
                            String content = "";
                            for (edu.cmu.lti.huiying.domainclasses.Field f : col.content)
                                content += f.text + "|";
                            coldoc.add(new StringField("colcontent",
                                    content.substring(0, content.length() - 1), Field.Store.YES));
                            coldoc.add(bstart);
                            coldoc.add(bend);
                            if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                                writer.addDocument(coldoc);
                                totalDocAdded++;
                            } else {
                                writer.updateDocument(new Term("path", file.getPath()), coldoc);
                            }
                            for (edu.cmu.lti.huiying.domainclasses.Field f : col.content) {
                                // index one document per cell
                                Document celldoc = new Document();
                                ArrayList<Double> fv = nfg.field2Features(f);
                                if (fv.get(0) == 1 || fv.get(0) == 2) {
                                    try {
                                        celldoc.add(new DoubleField("value", fv.get(1), Field.Store.YES));
                                        celldoc.add(new StringField("text", f.text, Field.Store.YES));
                                        // only index optional features when present and numeric
                                        if (fv.get(2) != null && !Double.isNaN(fv.get(2))) {
                                            celldoc.add(new DoubleField("error", fv.get(2), Field.Store.NO));
                                        }
                                        if (fv.get(5) != null && !Double.isNaN(fv.get(5))) {
                                            celldoc.add(new DoubleField("cellmag", fv.get(5), Field.Store.NO));
                                        }
                                        if (fv.get(4) != null) {
                                            celldoc.add(new DoubleField("cellpvalue", fv.get(4), Field.Store.NO));
                                        }
                                        celldoc.add(new StringField("filename", file.getAbsolutePath(), Field.Store.YES));
                                        celldoc.add(new StringField("type", "cell", Field.Store.YES));
                                        celldoc.add(new IntField("bytestart", f.byteStart, Field.Store.YES));
                                        celldoc.add(new IntField("byteend", f.byteEnd, Field.Store.YES));
                                    } catch (NullPointerException e) {
                                        e.printStackTrace();
                                        System.out.println(f.text);
                                    }
                                    if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                                        writer.addDocument(celldoc);
                                        totalDocAdded++;
                                    } else {
                                        writer.updateDocument(new Term("path", file.getPath()), celldoc);
                                    }
                                }
                            }
                        }
                    }
                }
            } finally {
                fis.close();
            }
        }
    }
}
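Given the package name (rangedsearch), the point of indexing these values as DoubleField/IntField is to make them filterable by numeric range at query time. A hedged sketch of the matching query side (Lucene 4.x API to match the field classes above; the field name is taken from the snippet, the bounds are illustrative):

// match column documents whose "mean" lies in [0.0, 100.0], both ends inclusive
Query q = NumericRangeQuery.newDoubleRange("mean", 0.0, 100.0, true, true);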
From source file:edu.cmu.lti.huiying.ir.rangedsearch.TableIndexer.java
License:Apache License
public void indexOffsetAnnotation(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list(); // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    if (files[i].equals("NeuroScience.num.offset"))
                        indexOffsetAnnotation(writer, new File(file, files[i]));
                }
            }
        } else {
            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                return;
            }
            try {
                // make a new, empty document
                Document doc = new Document();
                BufferedReader br = new BufferedReader(new InputStreamReader(fis, StandardCharsets.UTF_8));
                String line = null;
                String filename = null;
                while ((line = br.readLine()) != null) {
                    if (line.trim().length() == 0) {
                        // a blank line ends the current record: attach the file
                        // name and flush the document to the index
                        doc.add(new StringField("filename", filename, Field.Store.YES));
                        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                            writer.addDocument(doc);
                        } else {
                            System.out.println("updating " + file);
                            writer.updateDocument(new Term("path", file.getPath()), doc);
                        }
                        doc = new Document();
                        filename = null;
                        continue;
                    }
                    String[] spl = line.split("\t");
                    doc.add(new DoubleField(spl[3], Double.parseDouble(spl[5]), Field.Store.YES));
                    if (filename == null)
                        filename = spl[0];
                }
                br.close();
            } finally {
                fis.close();
            }
        }
    }
}
From source file:edu.cmu.lti.oaqa.knn4qa.apps.LuceneIndexer.java
License:Apache License
public static void main(String[] args) {
    Options options = new Options();

    options.addOption(CommonParams.ROOT_DIR_PARAM, null, true, CommonParams.ROOT_DIR_DESC);
    options.addOption(CommonParams.SUB_DIR_TYPE_PARAM, null, true, CommonParams.SUB_DIR_TYPE_DESC);
    options.addOption(CommonParams.MAX_NUM_REC_PARAM, null, true, CommonParams.MAX_NUM_REC_DESC);
    options.addOption(CommonParams.SOLR_FILE_NAME_PARAM, null, true, CommonParams.SOLR_FILE_NAME_DESC);
    options.addOption(CommonParams.OUT_INDEX_PARAM, null, true, CommonParams.OUT_MINDEX_DESC);

    CommandLineParser parser = new org.apache.commons.cli.GnuParser();

    try {
        CommandLine cmd = parser.parse(options, args);

        String rootDir = null;
        rootDir = cmd.getOptionValue(CommonParams.ROOT_DIR_PARAM);
        if (null == rootDir)
            Usage("Specify: " + CommonParams.ROOT_DIR_DESC, options);

        String outputDirName = cmd.getOptionValue(CommonParams.OUT_INDEX_PARAM);
        if (null == outputDirName)
            Usage("Specify: " + CommonParams.OUT_MINDEX_DESC, options);

        String subDirTypeList = cmd.getOptionValue(CommonParams.SUB_DIR_TYPE_PARAM);
        if (null == subDirTypeList || subDirTypeList.isEmpty())
            Usage("Specify: " + CommonParams.SUB_DIR_TYPE_DESC, options);

        String solrFileName = cmd.getOptionValue(CommonParams.SOLR_FILE_NAME_PARAM);
        if (null == solrFileName)
            Usage("Specify: " + CommonParams.SOLR_FILE_NAME_DESC, options);

        int maxNumRec = Integer.MAX_VALUE;
        String tmp = cmd.getOptionValue(CommonParams.MAX_NUM_REC_PARAM);
        if (tmp != null) {
            try {
                maxNumRec = Integer.parseInt(tmp);
                if (maxNumRec <= 0) {
                    Usage("The maximum number of records should be a positive integer", options);
                }
            } catch (NumberFormatException e) {
                Usage("The maximum number of records should be a positive integer", options);
            }
        }

        File outputDir = new File(outputDirName);
        if (!outputDir.exists()) {
            if (!outputDir.mkdirs()) {
                System.out.println("couldn't create " + outputDir.getAbsolutePath());
                System.exit(1);
            }
        }
        if (!outputDir.isDirectory()) {
            System.out.println(outputDir.getAbsolutePath() + " is not a directory!");
            System.exit(1);
        }
        if (!outputDir.canWrite()) {
            System.out.println("Can't write to " + outputDir.getAbsolutePath());
            System.exit(1);
        }

        String subDirs[] = subDirTypeList.split(",");

        int docNum = 0;

        // No English analyzer here, all language-related processing is done already,
        // here we simply white-space tokenize and index tokens verbatim.
        Analyzer analyzer = new WhitespaceAnalyzer();
        FSDirectory indexDir = FSDirectory.open(outputDir);
        IndexWriterConfig indexConf = new IndexWriterConfig(analyzer.getVersion(), analyzer);

        System.out.println("Creating a new Lucene index, maximum # of docs to process: " + maxNumRec);
        indexConf.setOpenMode(OpenMode.CREATE);
        IndexWriter indexWriter = new IndexWriter(indexDir, indexConf);

        for (int subDirId = 0; subDirId < subDirs.length && docNum < maxNumRec; ++subDirId) {
            String inputFileName = rootDir + "/" + subDirs[subDirId] + "/" + solrFileName;

            System.out.println("Input file name: " + inputFileName);

            BufferedReader inpText = new BufferedReader(
                    new InputStreamReader(CompressUtils.createInputStream(inputFileName)));
            String docText = XmlHelper.readNextXMLIndexEntry(inpText);

            for (; docText != null && docNum < maxNumRec; docText = XmlHelper.readNextXMLIndexEntry(inpText)) {
                ++docNum;
                Map<String, String> docFields = null;
                Document luceneDoc = new Document();

                try {
                    docFields = XmlHelper.parseXMLIndexEntry(docText);
                } catch (Exception e) {
                    System.err.println(String.format("Parsing error, offending DOC #%d:\n%s", docNum, docText));
                    System.exit(1);
                }

                String id = docFields.get(UtilConst.TAG_DOCNO);

                if (id == null) {
                    System.err.println(String.format("No ID tag '%s', offending DOC #%d:\n%s",
                            UtilConst.TAG_DOCNO, docNum, docText));
                }

                luceneDoc.add(new StringField(UtilConst.TAG_DOCNO, id, Field.Store.YES));

                for (Map.Entry<String, String> e : docFields.entrySet())
                    if (!e.getKey().equals(UtilConst.TAG_DOCNO)) {
                        luceneDoc.add(new TextField(e.getKey(), e.getValue(), Field.Store.YES));
                    }

                indexWriter.addDocument(luceneDoc);
                if (docNum % 1000 == 0)
                    System.out.println("Indexed " + docNum + " docs");
            }
            System.out.println("Indexed " + docNum + " docs");
        }

        indexWriter.commit();
        indexWriter.close();
    } catch (ParseException e) {
        Usage("Cannot parse arguments", options);
    } catch (Exception e) {
        System.err.println("Terminating due to an exception: " + e);
        System.exit(1);
    }
}
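A closing note on the last two writer calls: commit() makes the added documents durable and visible to newly opened readers, and close() releases the writer, by default committing any pending changes as well, so the explicit commit() immediately before close() is defensive rather than required. A minimal try-with-resources form (a sketch against the same API, assuming commit-on-close is acceptable) makes the cleanup automatic:

try (IndexWriter indexWriter = new IndexWriter(indexDir, indexConf)) {
    // ... indexWriter.addDocument(luceneDoc) calls ...
}   // close() commits pending changes and releases the index write lock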