List of usage examples for org.apache.lucene.index IndexWriterConfig setSimilarity
public IndexWriterConfig setSimilarity(Similarity similarity)
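The Similarity passed to setSimilarity controls how the resulting IndexWriter encodes index-time scoring statistics (notably field-length norms); in recent Lucene releases the default is BM25. Below is a minimal sketch of the call, assuming a Lucene 6+ API where IndexWriterConfig takes only an Analyzer; the index path, analyzer, and BM25 parameters are placeholders, not values taken from any of the examples that follow.

import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class SetSimilarityExample {
    public static void main(String[] args) throws Exception {
        Directory dir = FSDirectory.open(Paths.get("/tmp/example-index")); // placeholder path
        IndexWriterConfig iwc = new IndexWriterConfig(new StandardAnalyzer());
        // Replace the default BM25 parameters (k1=1.2, b=0.75) with custom ones.
        iwc.setSimilarity(new BM25Similarity(1.2f, 0.5f));
        try (IndexWriter writer = new IndexWriter(dir, iwc)) {
            // documents added here have their norms encoded with the configured similarity
        }
    }
}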
From source file:MakeLuceneIndex.java
License:Apache License
/**
 * Index all text files under a directory.
 * @throws UnsupportedEncodingException
 * @throws FileNotFoundException
 */
public static void main(String[] args) throws FileNotFoundException, UnsupportedEncodingException {
    String baseDir = "/home/chrisschaefer/";
    //String wikiDumpFile = "Downloads/enwiki-20130604-pages-articles.xml.bz2";
    String wikiDumpFile = "enwiki-20130604-pages-articlese.xml.bz2";
    String luceneIndexName = "enwiki-20130604-lucene2";
    System.currentTimeMillis();
    boolean bIgnoreStubs = false;

    for (int i = 0; i < args.length; ++i) {
        if (args[i].equals("-luceneindex"))
            luceneIndexName = args[++i];
        if (args[i].equals("-basedir"))
            baseDir = args[++i];
        if (args[i].equals("-dumpfile"))
            wikiDumpFile = args[++i];
        if (args[i].equals("-includestubs"))
            bIgnoreStubs = true;
    }

    String rawTextPath = baseDir + luceneIndexName + "-raw-text.txt";
    String logPath = baseDir + luceneIndexName + ".log";
    PrintWriter artikelTextWriter = new PrintWriter(rawTextPath, "UTF-8");
    PrintWriter logger = new PrintWriter(logPath, "UTF-8");
    logger.println("Indexing to directory '" + baseDir + luceneIndexName + "'");
    System.out.println("Indexing to directory '" + baseDir + luceneIndexName + "'");

    Date start = new Date();

    try {
        Directory dir = FSDirectory.open(new File(baseDir + luceneIndexName));
        Analyzer analyzer = new WikipediaAnalyzer();
        // Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_43, analyzer);

        // Create a new index in the directory, removing any
        // previously indexed documents:
        iwc.setOpenMode(OpenMode.CREATE);
        iwc.setSimilarity(new ESASimilarity());

        // Optional: for better indexing performance, if you
        // are indexing many documents, increase the RAM
        // buffer. But if you do this, increase the max heap
        // size to the JVM (eg add -Xmx512m or -Xmx1g):
        //
        // iwc.setRAMBufferSizeMB(2000.0);

        IndexWriter writer = new IndexWriter(dir, iwc);

        Extractor wikidumpExtractor = new Extractor(baseDir + File.separator + wikiDumpFile);
        wikidumpExtractor.setLinkSeparator("_");
        wikidumpExtractor.setCategorySeparator("_");
        wikidumpExtractor.setTitleSeparator(" ");

        int iStubs = 0;
        int iArticleCount = 0;
        int iSkippedPageCount = 0;
        long iStartTime = java.lang.System.nanoTime();
        long iTime = iStartTime;

        while (wikidumpExtractor.nextPage()) {
            if (wikidumpExtractor.getPageType() != Extractor.PageType.ARTICLE) {
                ++iSkippedPageCount;
                continue;
            }

            if (bIgnoreStubs && wikidumpExtractor.getStub()) {
                ++iStubs;
                continue;
            }

            // skip pages with less than 5 out links
            if (wikidumpExtractor.getPageLinkList(true).size() < 5) {
                ++iSkippedPageCount;
                continue;
            }

            if (wikidumpExtractor.getPageCategories().equals("")) {
                ++iSkippedPageCount;
                logger.println("skipped because of stop category: " + wikidumpExtractor.getPageTitle(false));
                continue;
            } else {
                for (String link : wikidumpExtractor.getPageLinkList(false)) {
                    // artikelTextWriter.println(link);
                    if (_inLinks.containsKey(link)) {
                        int tmp = _inLinks.get(link);
                        tmp++;
                        _inLinks.put(link, tmp);
                    } else {
                        _inLinks.put(link, 1);
                    }
                }
            }

            if (wikidumpExtractor.getPageText().equals("")) {
                ++iSkippedPageCount;
                continue;
            }

            artikelTextWriter.println(
                    wikidumpExtractor.getPageTitle(false) + "\t" + wikidumpExtractor.getPageText(false));

            ++iArticleCount;
            if (iArticleCount % 1000 == 0) {
                logger.println(new Date().toString() + " phase 1 -- iArticleCount: " + iArticleCount
                        + " iSkippedPageCount: " + iSkippedPageCount);
            }
        }
        artikelTextWriter.close();

        iArticleCount = 0;

        PrintWriter artikelInLinkWriter = new PrintWriter(baseDir + luceneIndexName + "-inlinks.txt", "UTF-8");
        BufferedReader br = new BufferedReader(new FileReader(rawTextPath));
        String line = br.readLine();

        while (line != null) {
            int endOfTitle = line.indexOf("\t");
            String title = line.substring(0, endOfTitle);

            if (_inLinks.containsKey(title)) {
                int inlinks = _inLinks.get(title);
                artikelInLinkWriter.println(title + "\t" + inlinks);

                if (inlinks > 4) {
                    //System.out.println("inlinks > 0 ");
                    Document doc = new Document();
                    ++iArticleCount;
                    // wikidumpExtractor.setTitleSeparator( "_" );
                    // doc.add( new TextField( "url_title", wikidumpExtractor.getPageTitle( false ), Field.Store.YES) );
                    // doc.add( new TextField( "title", wikidumpExtractor.getPageTitle( false ), Field.Store.YES) );
                    //doc.add(new LongField("wiki_id", wikidumpExtractor.getPageId(), Field.Store.YES));
                    doc.add(new TextField("contents",
                            title + " " + title + " " + title + " " + title + " " + line.substring(endOfTitle + 1),
                            Field.Store.NO));
                    // System.out.println(title + " " +
                    //         title + " " +
                    //         title + " " +
                    //         title + " " +
                    //         line.substring(endOfTitle+1));

                    writer.addDocument(doc);

                    if (iArticleCount % 1000 == 0) {
                        writer.commit();
                        logger.println(new Date().toString() + " phase 2 -- iArticleCount: " + iArticleCount
                                + " iSkippedPageCount: " + iSkippedPageCount);
                    }
                }
            } else {
                artikelInLinkWriter.println(title + "\t0");
            }
            line = br.readLine();
        }
        br.close();
        artikelInLinkWriter.close();

        // NOTE: if you want to maximize search performance,
        // you can optionally call forceMerge here. This can be
        // a terribly costly operation, so generally it's only
        // worth it when your index is relatively static (ie
        // you're done adding documents to it):
        //
        // writer.commit();
        writer.forceMerge(1);
        writer.close();

        Date end = new Date();
        String endStatement = end.getTime() - start.getTime() + " total milliseconds ("
                + (end.getTime() - start.getTime()) / 3600000.0 + " hours), " + iArticleCount + " Articles.";
        logger.println(endStatement);
        System.out.println(endStatement);
        logger.close();
    } catch (Exception e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}
From source file:IrqaQuery.java
License:Apache License
public static void makeIndexWriter(String indexPath, String stopPath, String sim) throws IOException {
    System.out.println("[makeIndexWriter] started");
    System.out.println("[makeIndexWriter]" + stopPath);
    Directory dir = FSDirectory.open(Paths.get(indexPath));
    Analyzer analyzer = new EnglishAnalyzer(StopFilter.makeStopSet(mygetStopwords(stopPath)));
    IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

    if (sim.equals("TFIDF"))
        iwc.setSimilarity(new ClassicSimilarity());
    else if (sim.equals("BM25"))
        iwc.setSimilarity(new BM25Similarity());
    else
        iwc.setSimilarity(new BM25Similarity());

    writer = new IndexWriter(dir, iwc);
}
From source file:alix.lucene.Alix.java
License:Open Source License
/**
 * Start to scan the glob of xml files
 *
 * @param indexDir where the lucene indexes are generated
 * @param anAnalyzer Analyzer to use for analyzed fields
 * @param similarity instance of Similarity to work with the writer
 * @throws TransformerConfigurationException
 */
static public void walk(String xmlGlob, String xslFile, String indexDir)
        throws IOException, TransformerConfigurationException {
    info("Lucene, src:" + xmlGlob + " parser:" + xslFile + " index:" + indexDir);

    Path srcDir = Paths.get(xmlGlob);
    PathMatcher glob = FileSystems.getDefault().getPathMatcher("glob:*.xml");
    if (!Files.isDirectory(srcDir)) {
        String pattern = srcDir.getFileName().toString();
        glob = FileSystems.getDefault().getPathMatcher("glob:" + pattern);
        srcDir = srcDir.getParent();
    }
    if (!Files.isDirectory(srcDir)) {
        fatal("FATAL " + srcDir + " NOT FOUND");
    }

    Path indexPath = Paths.get(indexDir);
    Files.createDirectories(indexPath);
    Directory dir = FSDirectory.open(indexPath);

    // TODO configure analyzers
    Analyzer analyzer = new XmlAnalyzer();
    IndexWriterConfig conf = new IndexWriterConfig(analyzer);
    conf.setOpenMode(OpenMode.CREATE_OR_APPEND);
    conf.setSimilarity(new BM25Similarity());
    conf.setCodec(new ChapitreCodec());
    // Optional: for better indexing performance, if you
    // are indexing many documents, increase the RAM
    // buffer. But if you do this, increase the max heap
    // size to the JVM (eg add -Xmx512m or -Xmx1g):
    //
    // conf.setRAMBufferSizeMB(256.0);
    lucwriter = new IndexWriter(dir, conf);

    System.setProperty("javax.xml.transform.TransformerFactory", "net.sf.saxon.TransformerFactoryImpl");
    TransformerFactory tf = TransformerFactory.newInstance();
    tf.setAttribute("http://saxon.sf.net/feature/version-warning", Boolean.FALSE);
    tf.setAttribute("http://saxon.sf.net/feature/recoveryPolicy", new Integer(0));
    parser = tf.newTransformer(new StreamSource(xslFile));

    final PathMatcher matcher = glob; // transmit the matcher by a final variable to the anonymous class
    Files.walkFileTree(srcDir, new SimpleFileVisitor<Path>() {
        @Override
        public FileVisitResult visitFile(Path path, BasicFileAttributes attrs) {
            if (path.getFileName().toString().startsWith("."))
                return FileVisitResult.CONTINUE;
            if (!matcher.matches(path.getFileName()))
                return FileVisitResult.CONTINUE;
            parse(path);
            return FileVisitResult.CONTINUE;
        }

        public FileVisitResult preVisitDirectory(Path path, BasicFileAttributes attrs) {
            // .git, .svn
            if (path.getFileName().toString().startsWith("."))
                return FileVisitResult.SKIP_SUBTREE;
            return FileVisitResult.CONTINUE;
        }
    });

    lucwriter.commit();
    // NOTE: if you want to maximize search performance,
    // you can optionally call forceMerge here. This can be
    // a terribly costly operation, so generally it's only
    // worth it when your index is relatively static (ie
    // you're done adding documents to it):
    //
    // lucwriter.forceMerge(1);
    lucwriter.close();
}
From source file:apps.LuceneIndexer.java
License:Apache License
public static void main(String[] args) {
    Options options = new Options();

    options.addOption("i", null, true, "input file");
    options.addOption("o", null, true, "output directory");
    options.addOption("r", null, true, "optional output TREC-format QREL file");

    options.addOption("bm25_b", null, true, "BM25 parameter: b");
    options.addOption("bm25_k1", null, true, "BM25 parameter: k1");
    options.addOption("bm25fixed", null, false, "use the fixed BM25 similarity");

    Joiner commaJoin = Joiner.on(',');
    Joiner spaceJoin = Joiner.on(' ');

    options.addOption("source_type", null, true,
            "document source type: " + commaJoin.join(SourceFactory.getDocSourceList()));

    // If you increase this value, you may need to modify the following line in *.sh file
    // export MAVEN_OPTS="-Xms8192m -server"
    double ramBufferSizeMB = 1024 * 8; // 8 GB

    CommandLineParser parser = new org.apache.commons.cli.GnuParser();

    IndexWriter indexWriter = null;
    BufferedWriter qrelWriter = null;

    int docNum = 0;

    try {
        CommandLine cmd = parser.parse(options, args);

        String inputFileName = null, outputDirName = null, qrelFileName = null;

        if (cmd.hasOption("i")) {
            inputFileName = cmd.getOptionValue("i");
        } else {
            Usage("Specify 'input file'", options);
        }

        if (cmd.hasOption("o")) {
            outputDirName = cmd.getOptionValue("o");
        } else {
            Usage("Specify 'index directory'", options);
        }

        if (cmd.hasOption("r")) {
            qrelFileName = cmd.getOptionValue("r");
        }

        String sourceName = cmd.getOptionValue("source_type");

        if (sourceName == null)
            Usage("Specify document source type", options);

        if (qrelFileName != null)
            qrelWriter = new BufferedWriter(new FileWriter(qrelFileName));

        File outputDir = new File(outputDirName);

        if (!outputDir.exists()) {
            if (!outputDir.mkdirs()) {
                System.out.println("couldn't create " + outputDir.getAbsolutePath());
                System.exit(1);
            }
        }
        if (!outputDir.isDirectory()) {
            System.out.println(outputDir.getAbsolutePath() + " is not a directory!");
            System.exit(1);
        }
        if (!outputDir.canWrite()) {
            System.out.println("Can't write to " + outputDir.getAbsolutePath());
            System.exit(1);
        }

        boolean useFixedBM25 = cmd.hasOption("bm25fixed");

        float bm25_k1 = UtilConst.BM25_K1_DEFAULT, bm25_b = UtilConst.BM25_B_DEFAULT;

        if (cmd.hasOption("bm25_k1")) {
            try {
                bm25_k1 = Float.parseFloat(cmd.getOptionValue("bm25_k1"));
            } catch (NumberFormatException e) {
                Usage("Wrong format for 'bm25_k1'", options);
            }
        }

        if (cmd.hasOption("bm25_b")) {
            try {
                bm25_b = Float.parseFloat(cmd.getOptionValue("bm25_b"));
            } catch (NumberFormatException e) {
                Usage("Wrong format for 'bm25_b'", options);
            }
        }

        EnglishAnalyzer analyzer = new EnglishAnalyzer();
        FSDirectory indexDir = FSDirectory.open(Paths.get(outputDirName));
        IndexWriterConfig indexConf = new IndexWriterConfig(analyzer);

        /*
         * OpenMode.CREATE creates a new index or overwrites an existing one.
         * https://lucene.apache.org/core/6_0_0/core/org/apache/lucene/index/IndexWriterConfig.OpenMode.html#CREATE
         */
        indexConf.setOpenMode(OpenMode.CREATE);
        indexConf.setRAMBufferSizeMB(ramBufferSizeMB);

        System.out.println(String.format("BM25 parameters k1=%f b=%f ", bm25_k1, bm25_b));

        if (useFixedBM25) {
            System.out.println(String.format("Using fixed BM25Simlarity, k1=%f b=%f", bm25_k1, bm25_b));
            indexConf.setSimilarity(new BM25SimilarityFix(bm25_k1, bm25_b));
        } else {
            System.out.println(String.format("Using Lucene BM25Similarity, k1=%f b=%f", bm25_k1, bm25_b));
            indexConf.setSimilarity(new BM25Similarity(bm25_k1, bm25_b));
        }

        indexWriter = new IndexWriter(indexDir, indexConf);

        DocumentSource inpDocSource = SourceFactory.createDocumentSource(sourceName, inputFileName);
        DocumentEntry inpDoc = null;
        TextCleaner textCleaner = new TextCleaner(null);

        while ((inpDoc = inpDocSource.next()) != null) {
            ++docNum;

            Document luceneDoc = new Document();
            ArrayList<String> cleanedToks = textCleaner.cleanUp(inpDoc.mDocText);
            String cleanText = spaceJoin.join(cleanedToks);

            // System.out.println(inpDoc.mDocId);
            // System.out.println(cleanText);
            // System.out.println("==============================");

            luceneDoc.add(new StringField(UtilConst.FIELD_ID, inpDoc.mDocId, Field.Store.YES));
            luceneDoc.add(new TextField(UtilConst.FIELD_TEXT, cleanText, Field.Store.YES));
            indexWriter.addDocument(luceneDoc);

            if (inpDoc.mIsRel != null && qrelWriter != null) {
                saveQrelOneEntry(qrelWriter, inpDoc.mQueryId, inpDoc.mDocId, inpDoc.mIsRel ? MAX_GRADE : 0);
            }
            if (docNum % 1000 == 0)
                System.out.println(String.format("Indexed %d documents", docNum));
        }
    } catch (ParseException e) {
        e.printStackTrace();
        Usage("Cannot parse arguments" + e, options);
    } catch (Exception e) {
        System.err.println("Terminating due to an exception: " + e);
        System.exit(1);
    } finally {
        System.out.println(String.format("Indexed %d documents", docNum));

        try {
            if (null != indexWriter)
                indexWriter.close();
            if (null != qrelWriter)
                qrelWriter.close();
        } catch (IOException e) {
            System.err.println("IO exception: " + e);
            e.printStackTrace();
        }
    }
}
From source file:biospectra.index.Indexer.java
License:Apache License
private void initialize(File indexPath, int kmerSize, boolean minStrandKmer, Similarity similarity,
        int workerThreads, int ramBufferSize) throws Exception {
    if (!indexPath.exists()) {
        indexPath.mkdirs();
    }

    if (indexPath.exists()) {
        cleanUpDirectory(indexPath);
    }

    this.indexPath = indexPath;
    this.minStrandKmer = minStrandKmer;
    this.analyzer = new KmerIndexAnalyzer(kmerSize, minStrandKmer);
    Directory dir = new MMapDirectory(this.indexPath.toPath());
    IndexWriterConfig config = new IndexWriterConfig(this.analyzer);
    if (similarity != null) {
        config.setSimilarity(similarity);
    }

    this.workerThreads = workerThreads;

    if (ramBufferSize > 0) {
        config.setRAMBufferSizeMB(ramBufferSize);
    }

    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    this.indexWriter = new IndexWriter(dir, config);

    this.executor = new BlockingExecutor(this.workerThreads, this.workerThreads * 2);

    for (int i = 0; i < this.workerThreads; i++) {
        Document doc = new Document();
        Field filenameField = new StringField(IndexConstants.FIELD_FILENAME, "", Field.Store.YES);
        Field headerField = new StringField(IndexConstants.FIELD_HEADER, "", Field.Store.YES);
        Field sequenceDirectionField = new StringField(IndexConstants.FIELD_SEQUENCE_DIRECTION, "",
                Field.Store.YES);
        Field taxonTreeField = new StringField(IndexConstants.FIELD_TAXONOMY_TREE, "", Field.Store.YES);
        Field sequenceField = new TextField(IndexConstants.FIELD_SEQUENCE, "", Field.Store.NO);

        doc.add(filenameField);
        doc.add(headerField);
        doc.add(sequenceDirectionField);
        doc.add(taxonTreeField);
        doc.add(sequenceField);

        this.freeQueue.offer(doc);
    }
}
From source file:br.usp.icmc.gazetteer.SemanticSearchTest.LuceneSearcher.java
License:Open Source License
public void similiaridade1() throws IOException {
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, a);
    conf.setSimilarity(similarityltc());
    writer = new IndexWriter(dir, conf);
}
From source file:br.usp.icmc.gazetteer.SemanticSearchTest.LuceneSearcher.java
License:Open Source License
public void similaridade2() throws IOException {
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, a);
    conf.setSimilarity(similaritylnc());
    writer = new IndexWriter(dir, conf);
}
From source file:br.usp.icmc.gazetteer.SemanticSearchTest.LuceneSearcher.java
License:Open Source License
public void similaridade3() throws IOException {
    IndexWriterConfig conf = new IndexWriterConfig(Version.LUCENE_36, a);
    conf.setSimilarity(similaritynnc());
    writer = new IndexWriter(dir, conf);
}
From source file:cn.hbu.cs.esearch.index.DiskSearchIndex.java
License:Apache License
/**
 * Opens an index modifier.
 * @param analyzer Analyzer
 * @return IndexModifer instance
 */
@Override
public IndexWriter openIndexWriter(Analyzer analyzer, Similarity similarity) throws IOException {
    if (_indexWriter != null) {
        return _indexWriter;
    }

    Directory directory = _dirMgr.getDirectory(true);

    log.info("opening index writer at: " + _dirMgr.getPath());

    EsearchMergePolicy mergePolicy = new EsearchMergePolicy();
    mergePolicy.setMergePolicyParams(_mergePolicyParams);

    // hao: autocommit is set to false with this constructor
    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, analyzer);
    config.setOpenMode(OpenMode.CREATE_OR_APPEND);
    _deletionPolicy = new ZoieIndexDeletionPolicy();
    config.setIndexDeletionPolicy(_deletionPolicy);
    config.setMergeScheduler(_mergeScheduler);
    config.setMergePolicy(mergePolicy);
    config.setReaderPooling(false);
    if (similarity != null) {
        config.setSimilarity(similarity);
    }
    config.setRAMBufferSizeMB(5);

    IndexWriter idxWriter = new IndexWriter(directory, config);

    // we need retrieve deletionPolicy from IndexWriter since deletionPolicy is deep cloned
    _deletionPolicy = (ZoieIndexDeletionPolicy) (idxWriter.getConfig().getIndexDeletionPolicy());
    _indexWriter = idxWriter;
    return idxWriter;
}
From source file:cn.hbu.cs.esearch.index.RAMSearchIndex.java
License:Apache License
@Override
public IndexWriter openIndexWriter(Analyzer analyzer, Similarity similarity) throws IOException {
    if (_indexWriter != null) {
        return _indexWriter;
    }

    EsearchMergePolicy mergePolicy = new EsearchMergePolicy();
    mergePolicy.setMergePolicyParams(_mergePolicyParams);
    mergePolicy.setUseCompoundFile(false);

    IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_43, analyzer);
    config.setOpenMode(OpenMode.CREATE_OR_APPEND);
    config.setMergeScheduler(_mergeScheduler);
    config.setMergePolicy(mergePolicy);
    config.setReaderPooling(false);
    if (similarity != null) {
        config.setSimilarity(similarity);
    }
    config.setRAMBufferSizeMB(3);

    IndexWriter idxWriter = new IndexWriter(_directory, config);
    _indexWriter = idxWriter;
    return idxWriter;
}
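In all of the examples above, the similarity passed to setSimilarity only governs what IndexWriter writes into the index (such as field-length norms); query-time scoring uses whatever Similarity the IndexSearcher holds, so the two should normally match. A short sketch of the corresponding search-time call, under the same Lucene 6+ assumption and with the same placeholder path and parameters as the sketch at the top of this page:

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.similarities.BM25Similarity;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class MatchingSearcherSimilarity {
    public static void main(String[] args) throws Exception {
        Directory dir = FSDirectory.open(Paths.get("/tmp/example-index")); // placeholder path
        try (DirectoryReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            // Use the same similarity (and parameters) that was set on the IndexWriterConfig,
            // otherwise the norms stored in the index and the query-time formula disagree.
            searcher.setSimilarity(new BM25Similarity(1.2f, 0.5f));
        }
    }
}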