Example usage for org.apache.lucene.index IndexWriter close

List of usage examples for org.apache.lucene.index IndexWriter close

Introduction

On this page you can find example usage for org.apache.lucene.index IndexWriter close.

Prototype

@Override
public void close() throws IOException 

Source Link

Document

Closes all open resources and releases the write lock.

Usage

From source file:MakeLuceneIndex.java

License:Apache License

/** Index all text files under a directory.
 *  Two-phase build of a Lucene index from a Wikipedia XML dump:
 *  phase 1 streams the dump via {@code Extractor}, writing each kept article as
 *  "title&lt;TAB&gt;text" to a raw-text file while counting in-links per linked
 *  title in the class-level {@code _inLinks} map; phase 2 re-reads the raw-text
 *  file and indexes only titles with more than 4 in-links, repeating the title
 *  four times in the "contents" field to weight it.
 * @throws UnsupportedEncodingException if UTF-8 is unsupported (cannot happen on a conforming JVM)
 * @throws FileNotFoundException if the raw-text or log file cannot be created */
public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
    String baseDir = "/home/chrisschaefer/";
    //String wikiDumpFile = "Downloads/enwiki-20130604-pages-articles.xml.bz2";
    String wikiDumpFile = "enwiki-20130604-pages-articlese.xml.bz2";
    String luceneIndexName = "enwiki-20130604-lucene2";

    System.currentTimeMillis(); // NOTE(review): result discarded -- this statement has no effect
    boolean bIgnoreStubs = false;

    // Minimal flag parsing; each value flag consumes the following argument.
    for (int i = 0; i < args.length; ++i) {
        if (args[i].equals("-luceneindex"))
            luceneIndexName = args[++i];

        if (args[i].equals("-basedir"))
            baseDir = args[++i];

        if (args[i].equals("-dumpfile"))
            wikiDumpFile = args[++i];

        // NOTE(review): "-includestubs" switches stub skipping ON, which contradicts
        // the flag's name -- confirm the intended polarity.
        if (args[i].equals("-includestubs"))
            bIgnoreStubs = true;
    }
    String rawTextPath = baseDir + luceneIndexName + "-raw-text.txt";
    String logPath = baseDir + luceneIndexName + ".log";
    // NOTE(review): these writers (and the IndexWriter below) are not closed if an
    // exception escapes the try block -- the catch swallows it and leaks them.
    PrintWriter artikelTextWriter = new PrintWriter(rawTextPath, "UTF-8");
    PrintWriter logger = new PrintWriter(logPath, "UTF-8");
    logger.println("Indexing to directory '" + baseDir + luceneIndexName + "'");
    System.out.println("Indexing to directory '" + baseDir + luceneIndexName + "'");

    Date start = new Date();

    try {

        Directory dir = FSDirectory.open(new File(baseDir + luceneIndexName));

        Analyzer analyzer = new WikipediaAnalyzer();
        //         Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer);

        // Create a new index in the directory, removing any
        // previously indexed documents:
        iwc.setOpenMode(OpenMode.CREATE);
        iwc.setSimilarity(new ESASimilarity());

        // Optional: for better indexing performance, if you
        // are indexing many documents, increase the RAM
        // buffer.  But if you do this, increase the max heap
        // size to the JVM (eg add -Xmxm or -Xmx1g):
        //
        iwc.setRAMBufferSizeMB(2000.0);

        IndexWriter writer = new IndexWriter(dir, iwc);

        // Dump reader; separators control how links/categories/titles are tokenized.
        Extractor wikidumpExtractor = new Extractor(baseDir + File.separator + wikiDumpFile);
        wikidumpExtractor.setLinkSeparator("_");
        wikidumpExtractor.setCategorySeparator("_");
        wikidumpExtractor.setTitleSeparator(" ");

        int iStubs = 0;
        int iArticleCount = 0;
        int iSkippedPageCount = 0;
        long iStartTime = java.lang.System.nanoTime();
        long iTime = iStartTime;

        // ---- Phase 1: stream the dump, write raw article text, count in-links ----
        while (wikidumpExtractor.nextPage()) {
            // Only real articles are kept (no redirects, talk pages, etc.).
            if (wikidumpExtractor.getPageType() != Extractor.PageType.ARTICLE) {
                ++iSkippedPageCount;
                continue;
            }

            if (bIgnoreStubs && wikidumpExtractor.getStub()) {
                ++iStubs;
                continue;
            }

            // skip pages with less than 5 out links
            if (wikidumpExtractor.getPageLinkList(true).size() < 5) {
                ++iSkippedPageCount;
                continue;
            }
            if (wikidumpExtractor.getPageCategories().equals("")) {
                ++iSkippedPageCount;
                logger.println("skipped because of stop category: " + wikidumpExtractor.getPageTitle(false));
                continue;
            } else {
                // Tally one in-link per outgoing link target of this page.
                for (String link : wikidumpExtractor.getPageLinkList(false)) {
                    //                    artikelTextWriter.println(link);
                    if (_inLinks.containsKey(link)) {
                        int tmp = _inLinks.get(link);
                        tmp++;
                        _inLinks.put(link, tmp);
                    } else {
                        _inLinks.put(link, 1);
                    }
                }
            }
            if (wikidumpExtractor.getPageText().equals("")) {
                ++iSkippedPageCount;
                continue;
            }
            // One article per line: "title<TAB>text" -- phase 2 depends on this format.
            artikelTextWriter.println(
                    wikidumpExtractor.getPageTitle(false) + "\t" + wikidumpExtractor.getPageText(false));

            ++iArticleCount;

            if (iArticleCount % 1000 == 0) {
                logger.println(new Date().toString() + " phase 1 -- iArticleCount: " + iArticleCount
                        + " iSkippedPageCount: " + iSkippedPageCount);
            }
        }
        artikelTextWriter.close();
        iArticleCount = 0; // reused as the phase-2 indexed-document counter

        // ---- Phase 2: re-read raw text, index articles with more than 4 in-links ----
        PrintWriter artikelInLinkWriter = new PrintWriter(baseDir + luceneIndexName + "-inlinks.txt", "UTF-8");
        BufferedReader br = new BufferedReader(new FileReader(rawTextPath));
        String line = br.readLine();

        while (line != null) {
            int endOfTitle = line.indexOf("\t");
            String title = line.substring(0, endOfTitle);
            if (_inLinks.containsKey(title)) {
                int inlinks = _inLinks.get(title);
                artikelInLinkWriter.println(title + "\t" + inlinks);
                if (inlinks > 4) {
                    //System.out.println("inlinks > 0 ");
                    Document doc = new Document();
                    ++iArticleCount;

                    //                    wikidumpExtractor.setTitleSeparator( "_" );
                    //                    doc.add( new TextField( "url_title", wikidumpExtractor.getPageTitle( false ), Field.Store.YES) );

                    // doc.add( new TextField( "title", wikidumpExtractor.getPageTitle( false ), Field.Store.YES) );
                    //doc.add(new LongField("wiki_id", wikidumpExtractor.getPageId(), Field.Store.YES));
                    // Title repeated 4x ahead of the body text to boost its term weight.
                    doc.add(new TextField("contents", title + " " + title + " " + title + " " + title + " "
                            + line.substring(endOfTitle + 1), Field.Store.NO));
                    //                  System.out.println(title + " " + 
                    //                        title + " " + 
                    //                        title + " " + 
                    //                        title + " " +
                    //                        line.substring(endOfTitle+1));

                    writer.addDocument(doc);

                    // Periodic commit bounds the amount of work lost on a crash.
                    if (iArticleCount % 1000 == 0) {
                        writer.commit();
                        logger.println(new Date().toString() + " phase 2 -- iArticleCount: " + iArticleCount
                                + " iSkippedPageCount: " + iSkippedPageCount);
                    }
                }
            } else {
                artikelInLinkWriter.println(title + "\t0");
            }
            line = br.readLine();
        }
        br.close();
        artikelInLinkWriter.close();

        // NOTE: if you want to maximize search performance,
        // you can optionally call forceMerge here.  This can be
        // a terribly costly operation, so generally it's only
        // worth it when your index is relatively static (ie
        // you're done adding documents to it):
        //
        writer.commit();
        writer.forceMerge(1);
        writer.close();

        Date end = new Date();
        String endStatement = end.getTime() - start.getTime() + " total milliseconds ("
                + (end.getTime() - start.getTime()) / 3600000.0 + " hours), " + iArticleCount + " Articles.";
        logger.println(endStatement);
        System.out.println(endStatement);
        logger.close();
    } catch (Exception e) {
        // NOTE(review): exception swallowed without a stack trace, and logger/writer
        // are left unclosed on this path.
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}

From source file:MakeLuceneIndexPreprocessed.java

License:Apache License

/** Index all text files under a directory.
 *  Re-indexes preprocessed Wikipedia output: reads the "-raw-text.txt"
 *  ("title&lt;TAB&gt;text") and "-inlinks.txt" ("title&lt;TAB&gt;count") files produced
 *  by the first pass, line-aligned, and indexes every article whose in-link
 *  count is positive, repeating the title four times in the "contents" field.
 * @throws UnsupportedEncodingException if UTF-8 is unsupported (cannot happen on a conforming JVM)
 * @throws FileNotFoundException if the log file cannot be created */
public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
    String baseDir = "/home/chrisschaefer/";

    String inputLuceneIndexName = "2013-06-18-lucene-gab";
    String luceneIndexName = "2013-06-18-lucene-gab-standard";

    // Minimal flag parsing; each value flag consumes the following argument.
    for (int i = 0; i < args.length; ++i) {
        if (args[i].equals("-inputluceneindex"))
            inputLuceneIndexName = args[++i];

        if (args[i].equals("-outputluceneindex"))
            luceneIndexName = args[++i];

        if (args[i].equals("-basedir"))
            baseDir = args[++i];

    }
    String rawTextPath = baseDir + inputLuceneIndexName + "-raw-text.txt";
    String artikelInLinksPath = baseDir + inputLuceneIndexName + "-inlinks.txt";
    String logPath = baseDir + inputLuceneIndexName + ".log";

    PrintWriter logger = new PrintWriter(logPath, "UTF-8");
    logger.println("Indexing to directory '" + baseDir + luceneIndexName + "'");
    System.out.println("Indexing to directory '" + baseDir + luceneIndexName + "'");

    Date start = new Date();
    logger.println(start.toString() + " iArticleCount: 0 iSkippedPageCount: 0");

    try {

        Directory dir = FSDirectory.open(new File(baseDir + luceneIndexName));

        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer);

        // Create a new index in the directory, removing any
        // previously indexed documents:
        iwc.setOpenMode(OpenMode.CREATE);

        // Large RAM buffer for bulk indexing; requires a correspondingly large heap.
        iwc.setRAMBufferSizeMB(2000.0);

        int iArticleCount = 0;
        int iSkippedPageCount = 0;

        IndexWriter writer = new IndexWriter(dir, iwc);
        try {
            BufferedReader rawTextReader = new BufferedReader(new FileReader(rawTextPath));
            BufferedReader artikelInLinksReader = new BufferedReader(new FileReader(artikelInLinksPath));
            try {
                String lineText = rawTextReader.readLine();
                String lineLinks = artikelInLinksReader.readLine();

                while (lineText != null) {
                    // Fail loudly instead of NPE-ing when the two files are out of sync.
                    if (lineLinks == null) {
                        throw new IllegalStateException(
                                "in-links file ended before raw-text file: " + artikelInLinksPath);
                    }
                    int endOfTitle = lineText.indexOf("\t");
                    String title = lineText.substring(0, endOfTitle);

                    // Index only articles with at least one in-link.
                    if (Integer.valueOf(lineLinks.substring(lineLinks.indexOf("\t") + 1)) > 0) {
                        ++iArticleCount;
                        Document doc = new Document();
                        // Title repeated 4x ahead of the body text to boost its term weight.
                        doc.add(new TextField("contents", title + " " + title + " " + title + " " + title + " "
                                + lineText.substring(endOfTitle + 1), Field.Store.NO));
                        writer.addDocument(doc);

                        // Periodic commit bounds the amount of work lost on a crash.
                        if (iArticleCount % 1000 == 0) {
                            writer.commit();
                            logger.println(new Date().toString() + " phase 2 -- iArticleCount: " + iArticleCount
                                    + " iSkippedPageCount: " + iSkippedPageCount);
                            logger.flush();
                        }
                    }
                    lineText = rawTextReader.readLine();
                    lineLinks = artikelInLinksReader.readLine();
                }
            } finally {
                rawTextReader.close();
                artikelInLinksReader.close();
            }

            // Merging to one segment is costly but worthwhile for a static index.
            writer.commit();
            writer.forceMerge(1);
        } finally {
            writer.close();
        }

        Date end = new Date();
        String endStatement = end.getTime() - start.getTime() + " total milliseconds ("
                + (end.getTime() - start.getTime()) / 3600000.0 + " hours), " + iArticleCount + " Articles.";
        logger.println(endStatement);
        System.out.println(endStatement);
    } catch (Exception e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    } finally {
        logger.close();
    }
}

From source file:FileIndexer.java

License:Apache License

/** Command-line driver: indexes all documents under DOCS_PATH into INDEX_PATH.
 *  Usage: java FileIndexer [-index INDEX_PATH] [-docs DOCS_PATH] [-excludes FILE] [-update]
 *  "-excludes" names a file listing one excluded file per line; "-update"
 *  appends to an existing index instead of recreating it. */
public static void main(String[] args) {
    String usage = "java FileIndexer" + " [-index INDEX_PATH] [-docs DOCS_PATH] [-excludes FILE] [-update]\n\n"
            + "This indexes the documents in DOCS_PATH, creating a Lucene index"
            + "in INDEX_PATH that can be searched with SearchFiles\n"
            + "excludes is an optional list of files to be excluded, one per line.";
    String indexPath = "index";
    String docsPath = null;
    boolean create = true;
    List<String> excludes = new ArrayList<String>();
    for (int i = 0; i < args.length; i++) {
        if ("-index".equals(args[i])) {
            indexPath = args[i + 1];
            i++;
        } else if ("-docs".equals(args[i])) {
            docsPath = args[i + 1];
            i++;
        } else if ("-excludes".equals(args[i])) {
            // try-with-resources closes the Scanner on every path (the original leaked it
            // if reading failed mid-way).
            try (Scanner sc = new Scanner(new File(args[i + 1]))) {
                i++;
                while (sc.hasNext()) {
                    excludes.add(sc.next());
                }
            } catch (FileNotFoundException fnfe) {
                System.err.println(fnfe.getMessage());
                System.exit(1);
            }
        } else if ("-update".equals(args[i])) {
            create = false;
        }
    }

    if (docsPath == null) {
        System.err.println("Usage: " + usage);
        System.exit(1);
    }

    final Path docDir = Paths.get(docsPath);
    if (!Files.isReadable(docDir)) {
        System.out.println("Document directory '" + docDir.toAbsolutePath()
                + "' does not exist or is not readable, please check the path");
        System.exit(1);
    }

    Date start = new Date();
    try {
        System.out.println("Indexing to directory '" + indexPath + "'...");

        Directory dir = FSDirectory.open(Paths.get(indexPath));
        // Cap pathological documents at 1M tokens to bound memory use.
        Analyzer analyzer = new LimitTokenCountAnalyzer(new StandardAnalyzer(), 1000000);
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

        if (create) {
            // Create a new index in the directory, removing any
            // previously indexed documents:
            iwc.setOpenMode(OpenMode.CREATE);
        } else {
            // Add new documents to an existing index:
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        }

        // Optional: for better indexing performance, if you
        // are indexing many documents, increase the RAM
        // buffer.  But if you do this, increase the max heap
        // size to the JVM (eg add -Xmx512m or -Xmx1g):
        //
        // iwc.setRAMBufferSizeMB(256.0);

        // try-with-resources closes (and commits) the writer even if indexDocs throws;
        // the original leaked it on that path.
        try (IndexWriter writer = new IndexWriter(dir, iwc)) {
            indexDocs(writer, docDir, excludes);

            // NOTE: to maximize search performance you can optionally call
            // writer.forceMerge(1) here -- costly, only worth it for static indices.
        }

        Date end = new Date();
        System.out.println(end.getTime() - start.getTime() + " total milliseconds");

    } catch (IOException e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }

}

From source file:LuceneIndexDirectoryOrFile.java

License:Apache License

/** Index all text files under a directory.
 *  Indexes everything under the class-level {@code sourceDir} into the
 *  class-level {@code indexDir}, recreating the index each run
 *  ({@code create} is always true here; the append branch is kept for parity
 *  with the original Lucene demo). */
public static void main(String[] args) {
    String usage = "java org.apache.lucene.demo.IndexFiles"
            + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n"
            + "This indexes the documents in DOCS_PATH, creating a Lucene index"
            + "in INDEX_PATH that can be searched with SearchFiles";
    //String indexPath = "index";
    //String docsPath = null;

    // Paths come from class-level configuration, not from args.
    String indexPath = indexDir;
    String docsPath = sourceDir;

    boolean create = true;

    if (docsPath == null) {
        System.err.println("Usage: " + usage);
        System.exit(1);
    }

    final File docDir = new File(docsPath);
    if (!docDir.exists() || !docDir.canRead()) {
        System.out.println("Document directory '" + docDir.getAbsolutePath()
                + "' does not exist or is not readable, please check the path");
        System.exit(1);
    }

    Date start = new Date();
    try {
        System.out.println("Indexing to directory '" + indexPath + "'...");

        Directory dir = FSDirectory.open(new File(indexPath));
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_40);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, analyzer);

        if (create) {
            // Create a new index in the directory, removing any
            // previously indexed documents:
            iwc.setOpenMode(OpenMode.CREATE);
        } else {
            // Add new documents to an existing index:
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        }

        // Optional: for better indexing performance, if you
        // are indexing many documents, increase the RAM
        // buffer.  But if you do this, increase the max heap
        // size to the JVM (eg add -Xmx512m or -Xmx1g):
        //
        // iwc.setRAMBufferSizeMB(256.0);

        IndexWriter writer = new IndexWriter(dir, iwc);
        // Close (and commit) the writer even if indexDocs throws; the original
        // leaked it on that path.
        try {
            indexDocs(writer, docDir);

            // NOTE: to maximize search performance you can optionally call
            // writer.forceMerge(1) here -- costly, only worth it for static indices.
        } finally {
            writer.close();
        }

        Date end = new Date();
        System.out.println(end.getTime() - start.getTime() + " total milliseconds");

    } catch (IOException e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}

From source file:DVBench.java

License:Apache License

/** Benchmarks numeric lookups at a given bit-width (bpv = bits per value):
 *  builds one index with the same random values stored four ways (NumericDocValues,
 *  an inverted LongField, and two binary-encoded variants), then times searches
 *  against each representation. */
static void doBench(int bpv) throws Exception {
    File file = new File("/data/indices/dvbench");
    file.mkdirs(); // NOTE(review): return value ignored -- failure surfaces later in FSDirectory.open
    Directory dir = FSDirectory.open(file);
    IndexWriterConfig config = new IndexWriterConfig(null);
    config.setOpenMode(OpenMode.CREATE);
    // Serial merges + doc-count merge policy keep segment geometry deterministic
    // across runs, so timings are comparable.
    config.setMergeScheduler(new SerialMergeScheduler());
    config.setMergePolicy(new LogDocMergePolicy());
    config.setMaxBufferedDocs(25000);
    IndexWriter writer = new IndexWriter(dir, config);

    MyRandom r = new MyRandom();
    int numdocs = 400000;
    // One reused Document/Field set: only the values change per iteration.
    Document doc = new Document();
    Field dv = new NumericDocValuesField("dv", 0);
    Field inv = new LongField("inv", 0, Field.Store.NO);
    Field boxed = new BinaryDocValuesField("boxed", new BytesRef(8));
    Field boxed2 = new BinaryDocValuesField("boxed2", new BytesRef(8));

    doc.add(dv);
    doc.add(inv);
    doc.add(boxed);
    doc.add(boxed2);
    for (int i = 0; i < numdocs; i++) {
        // defeat blockpackedwriter
        final long value;
        if (i % 8192 == 0) {
            value = bpv == 64 ? Long.MIN_VALUE : 0;
        } else if (i % 8192 == 1) {
            value = bpv == 64 ? Long.MAX_VALUE : (1L << bpv) - 1;
        } else {
            value = r.nextLong(bpv);
        }
        dv.setLongValue(value);
        inv.setLongValue(value);
        box(value, boxed.binaryValue());
        box(value, boxed2.binaryValue());
        boxed2.binaryValue().length = (bpv + 7) / 8; // fixed length
        writer.addDocument(doc);
    }

    writer.close();

    // run dv search tests
    String description = "dv (bpv=" + bpv + ")";
    DirectoryReader reader = DirectoryReader.open(dir);
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setQueryCache(null); // don't bench the cache

    // hash accumulates search results so the JIT cannot dead-code-eliminate them.
    int hash = 0;
    // warmup
    hash += search(description, searcher, "dv", 300, true);
    hash += search(description, searcher, "dv", 300, false);

    // Uninverting
    Map<String, UninvertingReader.Type> mapping = Collections.singletonMap("inv", UninvertingReader.Type.LONG);
    DirectoryReader uninv = UninvertingReader.wrap(reader, mapping);
    IndexSearcher searcher2 = new IndexSearcher(uninv);
    searcher2.setQueryCache(null); // don't bench the cache

    description = "fc (bpv=" + bpv + ")";
    // warmup
    hash += search(description, searcher2, "inv", 300, true);
    hash += search(description, searcher2, "inv", 300, false);

    // Boxed inside binary
    DirectoryReader boxedReader = new BinaryAsVLongReader(reader);
    IndexSearcher searcher3 = new IndexSearcher(boxedReader);
    searcher3.setQueryCache(null); // don't bench the cache
    description = "boxed (bpv=" + bpv + ")";
    // warmup
    hash += search(description, searcher3, "boxed", 300, true);
    hash += search(description, searcher3, "boxed", 300, false);

    description = "boxed fixed-length (bpv=" + bpv + ")";
    // warmup
    hash += search(description, searcher3, "boxed2", 300, true);
    hash += search(description, searcher3, "boxed2", 300, false);

    if (hash == 3) {
        // wont happen
        System.out.println("hash=" + hash);
    }
    // NOTE(review): only the base reader is closed; uninv/boxedReader wrap it and
    // are presumably released with it -- confirm the wrappers hold no extra resources.
    reader.close();
    dir.close();
}

From source file:TestWang.java

License:Open Source License

/** Indexes each image file in {@code images} into a new Lucene index at
 *  {@code indexPath}, printing progress every 100 files and a timing summary.
 * @param images    file paths of the images to index
 * @param builder   LIRE document builder that extracts features from each image stream
 * @param indexPath directory the index is (re)created in
 * @throws IOException if an image cannot be read or the index cannot be written */
private void indexFiles(ArrayList<String> images, DocumentBuilder builder, String indexPath)
        throws IOException {
    IndexWriter iw = LuceneUtils.createIndexWriter(indexPath, true);
    try {
        int count = 0;
        long time = System.currentTimeMillis();
        for (String identifier : images) {
            // Close each image stream (the original leaked one FileInputStream per image).
            FileInputStream in = new FileInputStream(identifier);
            try {
                iw.addDocument(builder.createDocument(in, identifier));
            } finally {
                in.close();
            }
            count++;
            if (count % 100 == 0)
                System.out.println(count + " files indexed.");
        }
        long timeTaken = (System.currentTimeMillis() - time);
        float sec = ((float) timeTaken) / 1000f;

        // Guard the per-image average against an empty input list (division by zero).
        long msPerImage = count > 0 ? timeTaken / count : 0;
        System.out.println(sec + " seconds taken, " + msPerImage + " ms per image.");
        iw.optimize();
    } finally {
        // Close the writer even if indexing throws (the original leaked it).
        iw.close();
    }
}

From source file:IndexAndSearchOpenStreetMaps1D.java

License:Apache License

/** Builds a 1D latitude index from an OpenStreetMaps CSV extract.
 *  Each input line is "id,lat,lon"; latitude is scaled to an int (1e6 * degrees)
 *  and indexed either as a legacy numeric field (USE_NF) or an IntPoint.
 * @throws IOException if the input cannot be read or the index cannot be written */
private static void createIndex() throws IOException {

    long t0 = System.nanoTime();

    // Fail fast on malformed UTF-8 rather than silently substituting characters.
    CharsetDecoder decoder = StandardCharsets.UTF_8.newDecoder().onMalformedInput(CodingErrorAction.REPORT)
            .onUnmappableCharacter(CodingErrorAction.REPORT);

    final int BUFFER_SIZE = 1 << 16; // 64K

    Directory dir = FSDirectory.open(Paths.get("/c/tmp/bkdtest1d" + (USE_NF ? "_nf" : "")));

    IndexWriterConfig iwc = new IndexWriterConfig(null);
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setRAMBufferSizeMB(256.0);
    // Serial merges + doc-count merge policy keep the build deterministic.
    iwc.setMergePolicy(new LogDocMergePolicy());
    iwc.setMergeScheduler(new SerialMergeScheduler());
    iwc.setInfoStream(new PrintStreamInfoStream(System.out));

    // try-with-resources closes reader and writer on all paths (the original
    // never closed the reader and leaked the writer on exception).
    try (BufferedReader reader = new BufferedReader(
            new InputStreamReader(
                    Files.newInputStream(Paths.get("/lucenedata/open-street-maps/latlon.subsetPlusAllLondon.txt")),
                    decoder),
            BUFFER_SIZE);
            IndexWriter w = new IndexWriter(dir, iwc)) {

        int count = 0;
        while (true) {
            String line = reader.readLine();
            if (line == null) {
                break;
            }

            String[] parts = line.split(",");
            // parts[0] is the OSM id (unused); scale degrees to micro-degrees.
            int lat = (int) (1000000. * Double.parseDouble(parts[1]));
            Document doc = new Document();
            if (USE_NF) {
                doc.add(new LegacyIntField("latnum", lat, Field.Store.NO));
            } else {
                doc.add(new IntPoint("lat", lat));
            }
            w.addDocument(doc);
            count++;
            if (count % 1000000 == 0) {
                System.out.println(count + "...");
            }
        }
        w.commit();
        System.out.println(w.maxDoc() + " total docs");
    }
    long t1 = System.nanoTime();
    System.out.println(((t1 - t0) / 1000000000.0) + " sec to build index");
}

From source file:IndexTaxis.java

License:Apache License

/** Multi-threaded CSV indexer: args are INDEX_PATH THREAD_COUNT CSV_PATH.
 *  Parses the CSV header on the main thread, then spawns THREAD_COUNT worker
 *  threads that each pull chunks from the shared input stream and index them. */
public static void main(String[] args) throws Exception {
    Path indexPath = Paths.get(args[0]);
    Directory dir = FSDirectory.open(indexPath);
    int threadCount = Integer.parseInt(args[1]);
    Path docsPath = Paths.get(args[2]);

    IndexWriterConfig iwc = new IndexWriterConfig();
    //System.out.println("NOW SET INFO STREAM");
    iwc.setRAMBufferSizeMB(1024.); // large buffer for bulk indexing; needs a large heap
    iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    iwc.setInfoStream(new PrintStreamInfoStream(System.out));
    //((ConcurrentMergeScheduler) iwc.getMergeScheduler()).disableAutoIOThrottle();

    final IndexWriter w = new IndexWriter(dir, iwc);

    BufferedInputStream docs = new BufferedInputStream(Files.newInputStream(docsPath, StandardOpenOption.READ));

    // parse the header fields
    // Read byte-by-byte up to the first newline, splitting column names on commas.
    List<String> fieldsList = new ArrayList<>();
    StringBuilder builder = new StringBuilder();
    while (true) {
        int x = docs.read();
        if (x == -1) {
            throw new IllegalArgumentException(
                    "hit EOF while trying to read CSV header; are you sure you have the right CSV file!");
        }
        byte b = (byte) x;
        if (b == NEWLINE) {
            fieldsList.add(builder.toString());
            break;
        } else if (b == COMMA) {
            fieldsList.add(builder.toString());
            builder.setLength(0);
        } else {
            // this is OK because headers are all ascii:
            builder.append((char) b);
        }
    }

    final String[] fields = fieldsList.toArray(new String[fieldsList.size()]);

    Thread[] threads = new Thread[threadCount];

    // Shared progress counters updated by all workers.
    final AtomicInteger docCounter = new AtomicInteger();
    final AtomicLong bytesCounter = new AtomicLong();

    startNS = System.nanoTime();

    for (int i = 0; i < threadCount; i++) {
        final int threadID = i;
        threads[i] = new Thread() {
            @Override
            public void run() {
                try {
                    _run();
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }
            }

            // Pull chunks until the stream is exhausted; readChunk returning null is EOF.
            // NOTE(review): all workers share `docs` -- presumably readChunk synchronizes
            // access to it internally; confirm before changing.
            private void _run() throws IOException {
                while (true) {
                    Chunk chunk = readChunk(docs);
                    if (chunk == null) {
                        break;
                    }
                    indexOneChunk(fields, chunk, w, docCounter, bytesCounter);
                }
            }
        };
        threads[i].start();
    }

    // Wait for all workers before closing the writer and input.
    for (int i = 0; i < threadCount; i++) {
        threads[i].join();
    }
    System.out.println("Indexing done; now close");

    w.close();
    docs.close();
}

From source file:action.indexing.Fragments.java

License:Apache License

/** Demonstrates document-level boosting: a mail document is boosted (1.5)
 *  when the sender's domain is important, or demoted (0.1) when it is
 *  unimportant, before being added to an in-memory index. */
public void docBoostMethod() throws IOException {

    Directory dir = new RAMDirectory();
    IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30),
            IndexWriter.MaxFieldLength.UNLIMITED);

    // Assemble the mail document from the message accessors.
    Document doc = new Document();
    doc.add(new Field("senderEmail", getSenderEmail(), Field.Store.YES, Field.Index.NOT_ANALYZED));
    doc.add(new Field("senderName", getSenderName(), Field.Store.YES, Field.Index.ANALYZED));
    doc.add(new Field("subject", getSubject(), Field.Store.YES, Field.Index.ANALYZED));
    doc.add(new Field("body", getBody(), Field.Store.NO, Field.Index.ANALYZED));

    // Scale the whole document's score by the sender domain's reputation.
    String domain = getSenderDomain().toLowerCase();
    if (isImportant(domain)) {
        doc.setBoost(1.5F); // good domain boost factor: 1.5
    } else if (isUnimportant(domain)) {
        doc.setBoost(0.1F); // bad domain boost factor: 0.1
    }
    writer.addDocument(doc);
    writer.close();
}

From source file:action.indexing.IndexingTest.java

License:Apache License

/** Test fixture: fills a fresh RAMDirectory with one document per entry in the
 *  parallel fixture arrays (ids / unindexed / unstored / text), exercising the
 *  four store/index field combinations. */
protected void setUp() throws Exception {
    directory = new RAMDirectory();

    IndexWriter writer = getWriter();

    int docCount = ids.length;
    for (int n = 0; n < docCount; n++) {
        Document doc = new Document();
        // stored + not analyzed / stored + not indexed / indexed only / stored + analyzed
        doc.add(new Field("id", ids[n], Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("country", unindexed[n], Field.Store.YES, Field.Index.NO));
        doc.add(new Field("contents", unstored[n], Field.Store.NO, Field.Index.ANALYZED));
        doc.add(new Field("city", text[n], Field.Store.YES, Field.Index.ANALYZED));
        writer.addDocument(doc);
    }
    writer.close();
}