Example usage for org.apache.lucene.document TextField TextField

List of usage examples for org.apache.lucene.document TextField TextField

Introduction

In this page you can find the example usage for org.apache.lucene.document TextField TextField.

Prototype

public TextField(String name, String value, Store store) 

Source Link

Document

Creates a new TextField with String value.

Usage

From source file:GUIFrame.java

private void AddDocument(IndexWriter indoWriter, String docID, String summary) throws IOException {
    Document doc = new Document();
    Field docIDField = new TextField("docID", docID, Field.Store.YES);
    Field sumField = new TextField("summary", summary, Field.Store.YES);

    doc.add(docIDField);//  w ww  .jav a  2 s.  c  o m
    doc.add(sumField);

    indoWriter.addDocument(doc);
}

From source file:MyServlet.java

private static void addDoc(IndexWriter w, String Class, String number, String time, String department)
        throws IOException {
    Document doc = new Document();
    // A text field will be tokenized
    doc.add(new TextField("Classes", Class, Field.Store.YES));
    // We use a string field for isbn because we don\'t want it tokenized
    doc.add(new StringField("Number", number, Field.Store.YES));
    doc.add(new StringField("Time", time, Field.Store.YES));
    doc.add(new StringField("Department", department, Field.Store.YES));
    w.addDocument(doc);/*from w  w w . j  a v a 2  s.  co m*/
}

From source file:MakeLuceneIndex.java

License:Apache License

/** Index all text files under a directory. 
 * @throws UnsupportedEncodingException 
 * @throws FileNotFoundException */
public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
    String baseDir = "/home/chrisschaefer/";
    //String wikiDumpFile = "Downloads/enwiki-20130604-pages-articles.xml.bz2";
    String wikiDumpFile = "enwiki-20130604-pages-articlese.xml.bz2";
    String luceneIndexName = "enwiki-20130604-lucene2";

    System.currentTimeMillis();/*from   w w w .j a  v a  2  s .  c om*/
    boolean bIgnoreStubs = false;

    for (int i = 0; i < args.length; ++i) {
        if (args[i].equals("-luceneindex"))
            luceneIndexName = args[++i];

        if (args[i].equals("-basedir"))
            baseDir = args[++i];

        if (args[i].equals("-dumpfile"))
            wikiDumpFile = args[++i];

        if (args[i].equals("-includestubs"))
            bIgnoreStubs = true;
    }
    String rawTextPath = baseDir + luceneIndexName + "-raw-text.txt";
    String logPath = baseDir + luceneIndexName + ".log";
    PrintWriter artikelTextWriter = new PrintWriter(rawTextPath, "UTF-8");
    PrintWriter logger = new PrintWriter(logPath, "UTF-8");
    logger.println("Indexing to directory '" + baseDir + luceneIndexName + "'");
    System.out.println("Indexing to directory '" + baseDir + luceneIndexName + "'");

    Date start = new Date();

    try {

        Directory dir = FSDirectory.open(new File(baseDir + luceneIndexName));

        Analyzer analyzer = new WikipediaAnalyzer();
        //         Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer);

        // Create a new index in the directory, removing any
        // previously indexed documents:
        iwc.setOpenMode(OpenMode.CREATE);
        iwc.setSimilarity(new ESASimilarity());

        // Optional: for better indexing performance, if you
        // are indexing many documents, increase the RAM
        // buffer.  But if you do this, increase the max heap
        // size to the JVM (eg add -Xmxm or -Xmx1g):
        //
        iwc.setRAMBufferSizeMB(2000.0);

        IndexWriter writer = new IndexWriter(dir, iwc);

        Extractor wikidumpExtractor = new Extractor(baseDir + File.separator + wikiDumpFile);
        wikidumpExtractor.setLinkSeparator("_");
        wikidumpExtractor.setCategorySeparator("_");
        wikidumpExtractor.setTitleSeparator(" ");

        int iStubs = 0;
        int iArticleCount = 0;
        int iSkippedPageCount = 0;
        long iStartTime = java.lang.System.nanoTime();
        long iTime = iStartTime;

        while (wikidumpExtractor.nextPage()) {
            if (wikidumpExtractor.getPageType() != Extractor.PageType.ARTICLE) {
                ++iSkippedPageCount;
                continue;
            }

            if (bIgnoreStubs && wikidumpExtractor.getStub()) {
                ++iStubs;
                continue;
            }

            // skip pages with less than 5 out links
            if (wikidumpExtractor.getPageLinkList(true).size() < 5) {
                ++iSkippedPageCount;
                continue;
            }
            if (wikidumpExtractor.getPageCategories().equals("")) {
                ++iSkippedPageCount;
                logger.println("skipped because of stop category: " + wikidumpExtractor.getPageTitle(false));
                continue;
            } else {
                for (String link : wikidumpExtractor.getPageLinkList(false)) {
                    //                    artikelTextWriter.println(link);
                    if (_inLinks.containsKey(link)) {
                        int tmp = _inLinks.get(link);
                        tmp++;
                        _inLinks.put(link, tmp);
                    } else {
                        _inLinks.put(link, 1);
                    }
                }
            }
            if (wikidumpExtractor.getPageText().equals("")) {
                ++iSkippedPageCount;
                continue;
            }
            artikelTextWriter.println(
                    wikidumpExtractor.getPageTitle(false) + "\t" + wikidumpExtractor.getPageText(false));

            ++iArticleCount;

            if (iArticleCount % 1000 == 0) {
                logger.println(new Date().toString() + " phase 1 -- iArticleCount: " + iArticleCount
                        + " iSkippedPageCount: " + iSkippedPageCount);
            }
        }
        artikelTextWriter.close();
        iArticleCount = 0;

        PrintWriter artikelInLinkWriter = new PrintWriter(baseDir + luceneIndexName + "-inlinks.txt", "UTF-8");
        BufferedReader br = new BufferedReader(new FileReader(rawTextPath));
        String line = br.readLine();

        while (line != null) {
            int endOfTitle = line.indexOf("\t");
            String title = line.substring(0, endOfTitle);
            if (_inLinks.containsKey(title)) {
                int inlinks = _inLinks.get(title);
                artikelInLinkWriter.println(title + "\t" + inlinks);
                if (inlinks > 4) {
                    //System.out.println("inlinks > 0 ");
                    Document doc = new Document();
                    ++iArticleCount;

                    //                    wikidumpExtractor.setTitleSeparator( "_" );
                    //                    doc.add( new TextField( "url_title", wikidumpExtractor.getPageTitle( false ), Field.Store.YES) );

                    // doc.add( new TextField( "title", wikidumpExtractor.getPageTitle( false ), Field.Store.YES) );
                    //doc.add(new LongField("wiki_id", wikidumpExtractor.getPageId(), Field.Store.YES));
                    doc.add(new TextField("contents", title + " " + title + " " + title + " " + title + " "
                            + line.substring(endOfTitle + 1), Field.Store.NO));
                    //                  System.out.println(title + " " + 
                    //                        title + " " + 
                    //                        title + " " + 
                    //                        title + " " +
                    //                        line.substring(endOfTitle+1));

                    writer.addDocument(doc);

                    if (iArticleCount % 1000 == 0) {
                        writer.commit();
                        logger.println(new Date().toString() + " phase 2 -- iArticleCount: " + iArticleCount
                                + " iSkippedPageCount: " + iSkippedPageCount);
                    }
                }
            } else {
                artikelInLinkWriter.println(title + "\t0");
            }
            line = br.readLine();
        }
        br.close();
        artikelInLinkWriter.close();

        // NOTE: if you want to maximize search performance,
        // you can optionally call forceMerge here.  This can be
        // a terribly costly operation, so generally it's only
        // worth it when your index is relatively static (ie
        // you're done adding documents to it):
        //
        writer.commit();
        writer.forceMerge(1);
        writer.close();

        Date end = new Date();
        String endStatement = end.getTime() - start.getTime() + " total milliseconds ("
                + (end.getTime() - start.getTime()) / 3600000.0 + " hours), " + iArticleCount + " Articles.";
        logger.println(endStatement);
        System.out.println(endStatement);
        logger.close();
    } catch (Exception e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}

From source file:MakeLuceneIndexPreprocessed.java

License:Apache License

/** Index all text files under a directory. 
 * @throws UnsupportedEncodingException 
 * @throws FileNotFoundException */
public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
    String baseDir = "/home/chrisschaefer/";

    String inputLuceneIndexName = "2013-06-18-lucene-gab";
    String luceneIndexName = "2013-06-18-lucene-gab-standard";

    System.currentTimeMillis();/*from w  w  w .  j a  v a2 s .c om*/

    for (int i = 0; i < args.length; ++i) {
        if (args[i].equals("-inputluceneindex"))
            inputLuceneIndexName = args[++i];

        if (args[i].equals("-outputluceneindex"))
            luceneIndexName = args[++i];

        if (args[i].equals("-basedir"))
            baseDir = args[++i];

    }
    String rawTextPath = baseDir + inputLuceneIndexName + "-raw-text.txt";
    String artikelInLinksPath = baseDir + inputLuceneIndexName + "-inlinks.txt";
    String logPath = baseDir + inputLuceneIndexName + ".log";

    PrintWriter logger = new PrintWriter(logPath, "UTF-8");
    logger.println("Indexing to directory '" + baseDir + luceneIndexName + "'");
    System.out.println("Indexing to directory '" + baseDir + luceneIndexName + "'");

    Date start = new Date();
    logger.println(start.toString() + " iArticleCount: 0 iSkippedPageCount: 0");

    try {

        Directory dir = FSDirectory.open(new File(baseDir + luceneIndexName));

        //         Analyzer analyzer = new WikipediaAnalyzer();
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer);

        // Create a new index in the directory, removing any
        // previously indexed documents:
        iwc.setOpenMode(OpenMode.CREATE);

        // Optional: for better indexing performance, if you
        // are indexing many documents, increase the RAM
        // buffer.  But if you do this, increase the max heap
        // size to the JVM (eg add -Xmxm or -Xmx1g):
        //
        iwc.setRAMBufferSizeMB(2000.0);
        //         iwc.setSimilarity(new ESASimilarity());

        IndexWriter writer = new IndexWriter(dir, iwc);

        int iArticleCount = 0;
        int iSkippedPageCount = 0;

        BufferedReader rawTextReader = new BufferedReader(new FileReader(rawTextPath));
        BufferedReader artikelInLinksReader = new BufferedReader(new FileReader(artikelInLinksPath));
        String lineText = rawTextReader.readLine();
        String lineLinks = artikelInLinksReader.readLine();

        while (lineText != null) {
            //            String title = lineText.substring(0, lineText.indexOf("\t")); 
            //            while(!title.equals(lineLinks.substring(0, lineLinks.indexOf("\t")))){
            //               lineLinks = artikelInLinksReader.readLine();
            //            }
            int endOfTitle = lineText.indexOf("\t");
            String title = lineText.substring(0, endOfTitle);

            if (Integer.valueOf(lineLinks.substring(lineLinks.indexOf("\t") + 1)) > 0) {
                ++iArticleCount;
                Document doc = new Document();
                doc.add(new TextField("contents", title + " " + title + " " + title + " " + title + " "
                        + lineText.substring(endOfTitle + 1), Field.Store.NO));
                //               System.out.println(title + " " + 
                //               title + " " + 
                //               title + " " + 
                //               title + " " +
                //               lineText.substring(endOfTitle+1));
                writer.addDocument(doc);

                if (iArticleCount % 1000 == 0) {
                    writer.commit();
                    logger.println(new Date().toString() + "phase 2 -- iArticleCount: " + iArticleCount
                            + " iSkippedPageCount: " + iSkippedPageCount);
                    logger.flush();
                }
            }
            lineText = rawTextReader.readLine();
            lineLinks = artikelInLinksReader.readLine();
        }
        rawTextReader.close();
        artikelInLinksReader.close();

        // NOTE: if you want to maximize search performance,
        // you can optionally call forceMerge here.  This can be
        // a terribly costly operation, so generally it's only
        // worth it when your index is relatively static (ie
        // you're done adding documents to it):
        //
        writer.commit();
        writer.forceMerge(1);
        writer.close();

        Date end = new Date();
        String endStatement = end.getTime() - start.getTime() + " total milliseconds ("
                + (end.getTime() - start.getTime()) / 3600000.0 + " hours), " + iArticleCount + " Articles.";
        logger.println(endStatement);
        System.out.println(endStatement);
        logger.close();
    } catch (Exception e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}

From source file:IrqaQuery.java

License:Apache License

public static void indexDoc(String docid, String... args) throws IOException {
    //        docid, title, contents,...
    Document doc = new Document();

    Field pathField = new StringField("docid", docid, Field.Store.YES);
    doc.add(pathField);//from   w ww .  j ava 2  s  . c o  m

    for (int i = 0; i < args.length; i += 2) {
        String field = args[i];
        String field_text = args[i + 1];
        doc.add(new TextField(field, field_text, Field.Store.NO));
        //            System.out.println("[doc.add]" + path + ":" + field + ":" + field_text);
    }

    System.out.println("adding " + docid);
    writer.addDocument(doc);
}

From source file:LuceneIndexDirectoryOrFile.java

License:Apache License

/**
 * Indexes the given file using the given writer, or if a directory is given,
 * recurses over files and directories found under the given directory.
 * //from  w  ww  .  ja  va2s  .  c om
 * NOTE: This method indexes one document per input file.  This is slow.  For good
 * throughput, put multiple documents into your input file(s).  An example of this is
 * in the benchmark module, which can create "line doc" files, one document per line,
 * using the
 * <a href="../../../../../contrib-benchmark/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.html"
 * >WriteLineDocTask</a>.
 *  
 * @param writer Writer to the index where the given file/dir info will be stored
 * @param file The file to index, or the directory to recurse into to find files to index
 * @throws IOException If there is a low-level I/O error
 */
static void indexDocs(IndexWriter writer, File file) throws IOException {
    // do not try to index files that cannot be read
    if (file.canRead()) {
        if (file.isDirectory()) {
            String[] files = file.list();
            // an IO error could occur
            if (files != null) {
                for (int i = 0; i < files.length; i++) {
                    indexDocs(writer, new File(file, files[i]));
                }
            }
        } else {

            FileInputStream fis;
            try {
                fis = new FileInputStream(file);
            } catch (FileNotFoundException fnfe) {
                // at least on windows, some temporary files raise this exception with an "access denied" message
                // checking if the file can be read doesn't help
                return;
            }

            try {

                //    File file = new File(fileToBeIndexed);
                BufferedReader br = new BufferedReader(new FileReader(file));
                String title = "";
                String docno = "";
                String text = "";
                String line = "";
                boolean docStarted = false;
                Document doc = null;
                while ((line = br.readLine()) != null) {

                    //Note that these fields are part of a TRECtext file
                    if (line.indexOf("<DOC>") > -1) {
                        docStarted = true;
                        doc = new Document();
                    } else if (line.indexOf("</DOC>") > -1) {
                        docStarted = false;

                        /*             Previous versions had fields with parameters
                                           doc.add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED));
                                           doc.add(new Field("docno", docno, Field.Store.NO, Field.Index.NOT_ANALYZED));
                                           doc.add(new Field("text", text, Field.Store.YES, Field.Index.ANALYZED));
                                                   
                                          StringField -- not analyzed, in one chunk
                                          TextFiled -- analyzed
                                                   
                         */

                        doc.add(new StringField("docno", docno, Field.Store.YES));
                        doc.add(new TextField("title", title, Field.Store.YES));
                        doc.add(new TextField("text", text, Field.Store.YES));

                        writer.addDocument(doc);
                    }
                    if (docStarted) {
                        int i = -1;
                        if (((i = line.indexOf("<TITLE>")) > -1) && (line.indexOf("</TITLE>") > -1)) {
                            title = (line.substring(i + "<TITLE>".length(), line.indexOf("</TITLE>")));
                        } else if ((i = line.indexOf("<TEXT>")) > -1) {
                            text = line.substring(i + "<TEXT>".length());
                        } else if ((i = line.indexOf("<DOCNO>")) > -1) {
                            docno = line.substring(i + "<DOCNO>".length(), line.indexOf("</DOCNO>"));
                        }

                    }
                }
                br.close();
                System.out.println("adding " + file);
                /*
                          // make a new, empty document
                          Document doc = new Document();
                                  
                          // Add the path of the file as a field named "path".  Use a
                          // field that is indexed (i.e. searchable), but don't tokenize 
                          // the field into separate words and don't index term frequency
                          // or positional information:
                          Field pathField = new StringField("path", file.getPath(), Field.Store.YES);
                          doc.add(pathField);
                        
                          // Add the last modified date of the file a field named "modified".
                          // Use a LongField that is indexed (i.e. efficiently filterable with
                          // NumericRangeFilter).  This indexes to milli-second resolution, which
                          // is often too fine.  You could instead create a number based on
                          // year/month/day/hour/minutes/seconds, down the resolution you require.
                          // For example the long value 2011021714 would mean
                          // February 17, 2011, 2-3 PM.
                          doc.add(new LongField("modified", file.lastModified(), Field.Store.NO));
                        
                          // Add the contents of the file to a field named "contents".  Specify a Reader,
                          // so that the text of the file is tokenized and indexed, but not stored.
                          // Note that FileReader expects the file to be in UTF-8 encoding.
                          // If that's not the case searching for special characters will fail.
                          doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));
                               
                          if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
                            // New index, so we just add the document (no old document can be there):
                            System.out.println("adding " + file);
                            writer.addDocument(doc);
                          } else {
                            // Existing index (an old copy of this document may have been indexed) so 
                            // we use updateDocument instead to replace the old one matching the exact 
                            // path, if present:
                            System.out.println("updating " + file);
                            writer.updateDocument(new Term("path", file.getPath()), doc);
                          }
                       */
            } finally {
                fis.close();
            }

        }
    }
}

From source file:IndexTaxis.java

License:Apache License

static void addOneField(Document doc, String fieldName, String rawValue) {
    // nocommit/*from   w ww  .j a v a  2  s  .c  om*/
    /*
    if (fieldName.equals("pick_up_lat")) {
      double value = Double.parseDouble(rawValue);
      doc.add(new DoublePoint(fieldName, value));
      doc.add(new SortedNumericDocValuesField(fieldName, NumericUtils.doubleToSortableLong(value)));
    }
    */
    switch (fieldName) {
    case "vendor_id":
    case "cab_color":
    case "payment_type":
    case "trip_type":
    case "rate_code":
    case "store_and_fwd_flag":
        doc.add(new StringField(fieldName, rawValue, Field.Store.NO));
        doc.add(new SortedSetDocValuesField(fieldName, new BytesRef(rawValue)));
        break;
    case "vendor_name":
        doc.add(new TextField(fieldName, rawValue, Field.Store.NO));
        break;
    case "pick_up_date_time":
    case "drop_off_date_time": {
        long value = Long.parseLong(rawValue);
        doc.add(new LongPoint(fieldName, value));
        doc.add(new SortedNumericDocValuesField(fieldName, value));
    }
        break;
    case "passenger_count": {
        int value = Integer.parseInt(rawValue);
        doc.add(new IntPoint(fieldName, value));
        doc.add(new SortedNumericDocValuesField(fieldName, value));
    }
        break;
    case "trip_distance":
    case "pick_up_lat":
    case "pick_up_lon":
    case "drop_off_lat":
    case "drop_off_lon":
    case "fare_amount":
    case "surcharge":
    case "mta_tax":
    case "extra":
    case "ehail_fee":
    case "improvement_surcharge":
    case "tip_amount":
    case "tolls_amount":
    case "total_amount": {
        double value;
        try {
            value = Double.parseDouble(rawValue);
        } catch (NumberFormatException nfe) {
            System.out.println(
                    "WARNING: failed to parse \"" + rawValue + "\" as double for field \"" + fieldName + "\"");
            return;
        }
        doc.add(new DoublePoint(fieldName, value));
        doc.add(new SortedNumericDocValuesField(fieldName, NumericUtils.doubleToSortableLong(value)));
    }
        break;
    default:
        throw new AssertionError("failed to handle field \"" + fieldName + "\"");
    }
}

From source file:antnlp.opie.indexsearch.IndexFiles.java

License:Apache License

/** Indexes a single document */
static void indexDoc(IndexWriter writer, Path file, long lastModified) throws IOException {

    InputStreamReader iReader = new InputStreamReader(Files.newInputStream(file), StandardCharsets.UTF_8);
    BufferedReader bufReader = new BufferedReader(iReader);

    String docLine = null;/*  www .  j av  a2 s . c  om*/
    while ((docLine = bufReader.readLine()) != null) {
        docLine = docLine.trim();
        if (docLine.length() == 0)
            continue;
        String[] column = docLine.split("\\t");
        System.out.println(column[0]);
        System.out.println(column[1]);

        // make a new, empty document
        Document doc = new Document();

        // Add the id of the file as a field named "id".  Use a
        // field that is indexed (i.e. searchable), but don't tokenize 
        // the field into separate words and don't index term frequency
        // or positional information:
        Field docidField = new StringField("docid", column[0], Field.Store.YES);
        doc.add(docidField);

        // Add the last modified date of the file a field named "modified".
        // Use a LongPoint that is indexed (i.e. efficiently filterable with
        // PointRangeQuery).  This indexes to milli-second resolution, which
        // is often too fine.  You could instead create a number based on
        // year/month/day/hour/minutes/seconds, down the resolution you require.
        // For example the long value 2011021714 would mean
        // February 17, 2011, 2-3 PM.
        doc.add(new LongPoint("modified", lastModified));

        // Add the contents of the file to a field named "contents".  Specify a Reader,
        // so that the text of the file is tokenized and indexed, but not stored.
        // Note that FileReader expects the file to be in UTF-8 encoding.
        // If that's not the case searching for special characters will fail.
        doc.add(new TextField("contents", column[1], Field.Store.YES));

        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can be there):
            System.out.println("adding " + column[0]);
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been indexed) so 
            // we use updateDocument instead to replace the old one matching the exact 
            // path, if present:
            System.out.println("updating " + column[0]);
            writer.updateDocument(new Term("docid", column[0]), doc);
        }
    }
    iReader.close();
    bufReader.close();
}

From source file:Application.mediaIndexer.java

/**
 * Indexes a single document/*from   w  w  w . ja  va  2  s.  co m*/
 * 
 * @throws TikaException
 * @throws SAXException
 */
public static void indexDoc(IndexWriter writer, Path file, TextArea results, long lastModified)
        throws IOException, SAXException, TikaException {
    AutoDetectParser parser = new AutoDetectParser();
    BodyContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    try (InputStream stream = Files.newInputStream(file)) {
        parser.parse(stream, handler, metadata);
        Document doc = new Document();
        String[] metadataNames = metadata.names();
        for (String name : metadataNames)
            doc.add(new TextField(name, metadata.get(name), Field.Store.YES));
        doc.add(new StringField("path", file.toString(), Field.Store.YES));
        doc.add(new LongPoint("modified", lastModified));
        results.appendText("Title: " + metadata.get("title") + "\n");
        results.appendText("Artists: " + metadata.get("xmpDM:artist") + "\n");
        results.appendText("Genre: " + metadata.get("xmpDM:genre") + "\n");
        results.appendText("Year: " + metadata.get("xmpDM:releaseDate") + "\n");
        if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
            // New index, so we just add the document (no old document can
            // be there):
            results.appendText("adding " + file + "\n");
            writer.addDocument(doc);
        } else {
            // Existing index (an old copy of this document may have been
            // indexed):
            results.appendText("updating " + file);
            writer.updateDocument(new Term("path", file.toString()), doc);
        }
    }
}

From source file:apps.LuceneIndexer.java

License:Apache License

public static void main(String[] args) {
    Options options = new Options();

    options.addOption("i", null, true, "input file");
    options.addOption("o", null, true, "output directory");
    options.addOption("r", null, true, "optional output TREC-format QREL file");

    options.addOption("bm25_b", null, true, "BM25 parameter: b");
    options.addOption("bm25_k1", null, true, "BM25 parameter: k1");
    options.addOption("bm25fixed", null, false, "use the fixed BM25 similarity");

    Joiner commaJoin = Joiner.on(',');
    Joiner spaceJoin = Joiner.on(' ');

    options.addOption("source_type", null, true,
            "document source type: " + commaJoin.join(SourceFactory.getDocSourceList()));

    // If you increase this value, you may need to modify the following line in *.sh file
    // export MAVEN_OPTS="-Xms8192m -server"
    double ramBufferSizeMB = 1024 * 8; // 8 GB

    CommandLineParser parser = new org.apache.commons.cli.GnuParser();

    IndexWriter indexWriter = null;/* www.  java2s .  c  o m*/
    BufferedWriter qrelWriter = null;

    int docNum = 0;

    try {
        CommandLine cmd = parser.parse(options, args);

        String inputFileName = null, outputDirName = null, qrelFileName = null;

        if (cmd.hasOption("i")) {
            inputFileName = cmd.getOptionValue("i");
        } else {
            Usage("Specify 'input file'", options);
        }

        if (cmd.hasOption("o")) {
            outputDirName = cmd.getOptionValue("o");
        } else {
            Usage("Specify 'index directory'", options);
        }

        if (cmd.hasOption("r")) {
            qrelFileName = cmd.getOptionValue("r");
        }

        String sourceName = cmd.getOptionValue("source_type");

        if (sourceName == null)
            Usage("Specify document source type", options);

        if (qrelFileName != null)
            qrelWriter = new BufferedWriter(new FileWriter(qrelFileName));

        File outputDir = new File(outputDirName);
        if (!outputDir.exists()) {
            if (!outputDir.mkdirs()) {
                System.out.println("couldn't create " + outputDir.getAbsolutePath());
                System.exit(1);
            }
        }
        if (!outputDir.isDirectory()) {
            System.out.println(outputDir.getAbsolutePath() + " is not a directory!");
            System.exit(1);
        }
        if (!outputDir.canWrite()) {
            System.out.println("Can't write to " + outputDir.getAbsolutePath());
            System.exit(1);
        }

        boolean useFixedBM25 = cmd.hasOption("bm25fixed");

        float bm25_k1 = UtilConst.BM25_K1_DEFAULT, bm25_b = UtilConst.BM25_B_DEFAULT;

        if (cmd.hasOption("bm25_k1")) {
            try {
                bm25_k1 = Float.parseFloat(cmd.getOptionValue("bm25_k1"));
            } catch (NumberFormatException e) {
                Usage("Wrong format for 'bm25_k1'", options);
            }
        }

        if (cmd.hasOption("bm25_b")) {
            try {
                bm25_b = Float.parseFloat(cmd.getOptionValue("bm25_b"));
            } catch (NumberFormatException e) {
                Usage("Wrong format for 'bm25_b'", options);
            }
        }

        EnglishAnalyzer analyzer = new EnglishAnalyzer();
        FSDirectory indexDir = FSDirectory.open(Paths.get(outputDirName));
        IndexWriterConfig indexConf = new IndexWriterConfig(analyzer);

        /*
            OpenMode.CREATE creates a new index or overwrites an existing one.
            https://lucene.apache.org/core/6_0_0/core/org/apache/lucene/index/IndexWriterConfig.OpenMode.html#CREATE
        */
        indexConf.setOpenMode(OpenMode.CREATE);
        indexConf.setRAMBufferSizeMB(ramBufferSizeMB);

        System.out.println(String.format("BM25 parameters k1=%f b=%f ", bm25_k1, bm25_b));

        if (useFixedBM25) {
            System.out.println(String.format("Using fixed BM25Simlarity, k1=%f b=%f", bm25_k1, bm25_b));
            indexConf.setSimilarity(new BM25SimilarityFix(bm25_k1, bm25_b));
        } else {
            System.out.println(String.format("Using Lucene BM25Similarity, k1=%f b=%f", bm25_k1, bm25_b));
            indexConf.setSimilarity(new BM25Similarity(bm25_k1, bm25_b));
        }

        indexWriter = new IndexWriter(indexDir, indexConf);

        DocumentSource inpDocSource = SourceFactory.createDocumentSource(sourceName, inputFileName);
        DocumentEntry inpDoc = null;
        TextCleaner textCleaner = new TextCleaner(null);

        while ((inpDoc = inpDocSource.next()) != null) {
            ++docNum;

            Document luceneDoc = new Document();
            ArrayList<String> cleanedToks = textCleaner.cleanUp(inpDoc.mDocText);
            String cleanText = spaceJoin.join(cleanedToks);

            //        System.out.println(inpDoc.mDocId);
            //        System.out.println(cleanText);
            //        System.out.println("==============================");

            luceneDoc.add(new StringField(UtilConst.FIELD_ID, inpDoc.mDocId, Field.Store.YES));
            luceneDoc.add(new TextField(UtilConst.FIELD_TEXT, cleanText, Field.Store.YES));
            indexWriter.addDocument(luceneDoc);

            if (inpDoc.mIsRel != null && qrelWriter != null) {
                saveQrelOneEntry(qrelWriter, inpDoc.mQueryId, inpDoc.mDocId, inpDoc.mIsRel ? MAX_GRADE : 0);
            }
            if (docNum % 1000 == 0)
                System.out.println(String.format("Indexed %d documents", docNum));

        }

    } catch (ParseException e) {
        e.printStackTrace();
        Usage("Cannot parse arguments" + e, options);
    } catch (Exception e) {
        System.err.println("Terminating due to an exception: " + e);
        System.exit(1);
    } finally {
        System.out.println(String.format("Indexed %d documents", docNum));

        try {
            if (null != indexWriter)
                indexWriter.close();
            if (null != qrelWriter)
                qrelWriter.close();
        } catch (IOException e) {
            System.err.println("IO exception: " + e);
            e.printStackTrace();
        }
    }
}