Example usage for org.apache.lucene.search.similarities.LMDirichletSimilarity - LMDirichletSimilarity()

Introduction

On this page you can find example usages of the LMDirichletSimilarity() constructor from the org.apache.lucene.search.similarities package.

Prototype

public LMDirichletSimilarity() 

Document

Instantiates the similarity with the default μ value of 2000.
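
Before the full examples below, here is a minimal sketch of constructing the similarity and installing it on a searcher. It assumes Lucene 5.x or later; the class name LMDirichletExample, the index path, and the explicit μ value of 500 are illustrative placeholders, not taken from the examples.

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.similarities.LMDirichletSimilarity;
import org.apache.lucene.store.FSDirectory;

public class LMDirichletExample {
    public static void main(String[] args) throws Exception {
        // Open an existing index (placeholder path).
        try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            IndexSearcher searcher = new IndexSearcher(reader);
            // Score with the Dirichlet-smoothed language model; the no-arg
            // constructor uses the default mu of 2000.
            searcher.setSimilarity(new LMDirichletSimilarity());
            // An explicit mu can be passed instead:
            // searcher.setSimilarity(new LMDirichletSimilarity(500f));
        }
    }
}

The examples that follow use the same pattern, either setting the similarity on an IndexSearcher or passing it to a Lucene classifier.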

Usage

From source file: com.github.tteofili.looseen.Test20NewsgroupsClassification.java

License: Apache License

@Test
public void test20Newsgroups() throws Exception {

    String indexProperty = System.getProperty("index");
    if (indexProperty != null) {
        try {
            index = Boolean.valueOf(indexProperty);
        } catch (Exception e) {
            // ignore
        }
    }

    String splitProperty = System.getProperty("split");
    if (splitProperty != null) {
        try {
            split = Boolean.valueOf(splitProperty);
        } catch (Exception e) {
            // ignore
        }
    }

    Path mainIndexPath = Paths.get(INDEX + "/original");
    Directory directory = FSDirectory.open(mainIndexPath);
    Path trainPath = Paths.get(INDEX + "/train");
    Path testPath = Paths.get(INDEX + "/test");
    Path cvPath = Paths.get(INDEX + "/cv");
    FSDirectory cv = null;
    FSDirectory test = null;
    FSDirectory train = null;
    IndexReader testReader = null;
    if (split) {
        cv = FSDirectory.open(cvPath);
        test = FSDirectory.open(testPath);
        train = FSDirectory.open(trainPath);
    }

    if (index) {
        delete(mainIndexPath);
        if (split) {
            delete(trainPath, testPath, cvPath);
        }
    }

    IndexReader reader = null;
    List<Classifier<BytesRef>> classifiers = new LinkedList<>();
    try {
        Analyzer analyzer = new StandardAnalyzer();
        if (index) {

            System.out.format("Indexing 20 Newsgroups...%n");

            long startIndex = System.currentTimeMillis();
            IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(analyzer));

            buildIndex(new File(PREFIX + "/20n/20_newsgroups"), indexWriter);

            long endIndex = System.currentTimeMillis();
            System.out.format("Indexed %d pages in %ds %n", indexWriter.maxDoc(),
                    (endIndex - startIndex) / 1000);

            indexWriter.close();

        }

        if (split && !index) {
            reader = DirectoryReader.open(train);
        } else {
            reader = DirectoryReader.open(directory);
        }

        if (index && split) {
            // split the index
            System.out.format("Splitting the index...%n");

            long startSplit = System.currentTimeMillis();
            DatasetSplitter datasetSplitter = new DatasetSplitter(0.1, 0);
            datasetSplitter.split(reader, train, test, cv, analyzer, false, CATEGORY_FIELD, BODY_FIELD,
                    SUBJECT_FIELD, CATEGORY_FIELD);
            reader.close();
            reader = DirectoryReader.open(train); // using the train index from now on
            long endSplit = System.currentTimeMillis();
            System.out.format("Splitting done in %ds %n", (endSplit - startSplit) / 1000);
        }

        final long startTime = System.currentTimeMillis();

        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 1, 0, 0,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, null, analyzer, null, 1, 0, 0, CATEGORY_FIELD,
                BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, 0, 0,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new AxiomaticF1EXP(), analyzer, null, 3, 0, 0,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new AxiomaticF1LOG(), analyzer, null, 3, 0, 0,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new LMDirichletSimilarity(), analyzer, null, 3,
                1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new LMJelinekMercerSimilarity(0.3f), analyzer,
                null, 3, 1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, null, analyzer, null, 3, 1, 1, CATEGORY_FIELD,
                BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new DFRSimilarity(new BasicModelG(), new AfterEffectB(), new NormalizationH1()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new DFRSimilarity(new BasicModelP(), new AfterEffectL(), new NormalizationH3()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new IBSimilarity(new DistributionSPL(), new LambdaDF(), new Normalization.NoNormalization()),
                analyzer, null, 3, 1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new IBSimilarity(new DistributionLL(), new LambdaTTF(), new NormalizationH1()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new MinHashClassifier(reader, BODY_FIELD, CATEGORY_FIELD, 15, 1, 100));
        classifiers.add(new MinHashClassifier(reader, BODY_FIELD, CATEGORY_FIELD, 30, 3, 300));
        classifiers.add(new MinHashClassifier(reader, BODY_FIELD, CATEGORY_FIELD, 10, 1, 100));
        classifiers.add(new KNearestFuzzyClassifier(reader, new LMJelinekMercerSimilarity(0.3f), analyzer, null,
                1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader,
                new IBSimilarity(new DistributionLL(), new LambdaTTF(), new NormalizationH1()), analyzer, null,
                1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 1,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 3,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers
                .add(new KNearestFuzzyClassifier(reader, null, analyzer, null, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers
                .add(new KNearestFuzzyClassifier(reader, null, analyzer, null, 3, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new AxiomaticF1EXP(), analyzer, null, 3,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new AxiomaticF1LOG(), analyzer, null, 3,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new BM25NBClassifier(reader, analyzer, null, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new CachingNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new SimpleNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, BODY_FIELD));

        int maxdoc;

        if (split) {
            testReader = DirectoryReader.open(test);
            maxdoc = testReader.maxDoc();
        } else {
            maxdoc = reader.maxDoc();
        }

        System.out.format("Starting evaluation on %d docs...%n", maxdoc);

        ExecutorService service = Executors.newCachedThreadPool();
        List<Future<String>> futures = new LinkedList<>();
        for (Classifier<BytesRef> classifier : classifiers) {
            testClassifier(reader, startTime, testReader, service, futures, classifier);
        }
        for (Future<String> f : futures) {
            System.out.println(f.get());
        }

        Thread.sleep(10000);
        service.shutdown();

    } finally {
        if (reader != null) {
            reader.close();
        }
        directory.close();
        if (test != null) {
            test.close();
        }
        if (train != null) {
            train.close();
        }
        if (cv != null) {
            cv.close();
        }
        if (testReader != null) {
            testReader.close();
        }

        for (Classifier c : classifiers) {
            if (c instanceof Closeable) {
                ((Closeable) c).close();
            }
        }
    }
}

From source file: com.github.tteofili.looseen.TestWikipediaClassification.java

License: Apache License

@Test
public void testItalianWikipedia() throws Exception {

    String indexProperty = System.getProperty("index");
    if (indexProperty != null) {
        try {
            index = Boolean.valueOf(indexProperty);
        } catch (Exception e) {
            // ignore
        }
    }

    String splitProperty = System.getProperty("split");
    if (splitProperty != null) {
        try {
            split = Boolean.valueOf(splitProperty);
        } catch (Exception e) {
            // ignore
        }
    }

    Path mainIndexPath = Paths.get(INDEX + "/original");
    Directory directory = FSDirectory.open(mainIndexPath);
    Path trainPath = Paths.get(INDEX + "/train");
    Path testPath = Paths.get(INDEX + "/test");
    Path cvPath = Paths.get(INDEX + "/cv");
    FSDirectory cv = null;
    FSDirectory test = null;
    FSDirectory train = null;
    DirectoryReader testReader = null;
    if (split) {
        cv = FSDirectory.open(cvPath);
        test = FSDirectory.open(testPath);
        train = FSDirectory.open(trainPath);
    }

    if (index) {
        delete(mainIndexPath);
        if (split) {
            delete(trainPath, testPath, cvPath);
        }
    }

    IndexReader reader = null;
    try {
        Collection<String> stopWordsList = Arrays.asList("di", "a", "da", "in", "per", "tra", "fra", "il", "lo",
                "la", "i", "gli", "le");
        CharArraySet stopWords = new CharArraySet(stopWordsList, true);
        Analyzer analyzer = new ItalianAnalyzer(stopWords);
        if (index) {

            System.out.format("Indexing Italian Wikipedia...%n");

            long startIndex = System.currentTimeMillis();
            IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(analyzer));

            importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current1.xml"), indexWriter);
            importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current2.xml"), indexWriter);
            importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current3.xml"), indexWriter);
            importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current4.xml"), indexWriter);

            long endIndex = System.currentTimeMillis();
            System.out.format("Indexed %d pages in %ds %n", indexWriter.maxDoc(),
                    (endIndex - startIndex) / 1000);

            indexWriter.close();

        }

        if (split && !index) {
            reader = DirectoryReader.open(train);
        } else {
            reader = DirectoryReader.open(directory);
        }

        if (index && split) {
            // split the index
            System.out.format("Splitting the index...%n");

            long startSplit = System.currentTimeMillis();
            DatasetSplitter datasetSplitter = new DatasetSplitter(0.1, 0);
            for (LeafReaderContext context : reader.leaves()) {
                datasetSplitter.split(context.reader(), train, test, cv, analyzer, false, CATEGORY_FIELD,
                        TEXT_FIELD, CATEGORY_FIELD);
            }
            reader.close();
            reader = DirectoryReader.open(train); // using the train index from now on
            long endSplit = System.currentTimeMillis();
            System.out.format("Splitting done in %ds %n", (endSplit - startSplit) / 1000);
        }

        final long startTime = System.currentTimeMillis();

        List<Classifier<BytesRef>> classifiers = new LinkedList<>();
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 1, 0, 0,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new BM25Similarity(), analyzer, null, 1, 0, 0,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, null, analyzer, null, 1, 0, 0, CATEGORY_FIELD,
                TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new LMDirichletSimilarity(), analyzer, null, 3,
                1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new LMJelinekMercerSimilarity(0.3f), analyzer,
                null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, 0, 0,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, 1, 1,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new DFRSimilarity(new BasicModelG(), new AfterEffectB(), new NormalizationH1()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new DFRSimilarity(new BasicModelP(), new AfterEffectL(), new NormalizationH3()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new IBSimilarity(new DistributionSPL(), new LambdaDF(), new Normalization.NoNormalization()),
                analyzer, null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new IBSimilarity(new DistributionLL(), new LambdaTTF(), new NormalizationH1()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 5, 1, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 10, 1, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 1, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 3, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 3, 300));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 5, 3, 100));
        classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 3,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 1,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new BM25Similarity(), analyzer, null, 3,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new BM25Similarity(), analyzer, null, 1,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new BM25NBClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new CachingNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new SimpleNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD));

        int maxdoc;

        if (split) {
            testReader = DirectoryReader.open(test);
            maxdoc = testReader.maxDoc();
        } else {
            maxdoc = reader.maxDoc();
        }

        System.out.format("Starting evaluation on %d docs...%n", maxdoc);

        ExecutorService service = Executors.newCachedThreadPool();
        List<Future<String>> futures = new LinkedList<>();
        for (Classifier<BytesRef> classifier : classifiers) {

            final IndexReader finalReader = reader;
            final DirectoryReader finalTestReader = testReader;
            futures.add(service.submit(() -> {
                ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix;
                if (split) {
                    confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(finalTestReader, classifier,
                            CATEGORY_FIELD, TEXT_FIELD, 60000 * 30);
                } else {
                    confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(finalReader, classifier,
                            CATEGORY_FIELD, TEXT_FIELD, 60000 * 30);
                }

                final long endTime = System.currentTimeMillis();
                final int elapse = (int) (endTime - startTime) / 1000;

                return " * " + classifier + " \n    * accuracy = " + confusionMatrix.getAccuracy()
                        + "\n    * precision = " + confusionMatrix.getPrecision() + "\n    * recall = "
                        + confusionMatrix.getRecall() + "\n    * f1-measure = " + confusionMatrix.getF1Measure()
                        + "\n    * avgClassificationTime = " + confusionMatrix.getAvgClassificationTime()
                        + "\n    * time = " + elapse + " (sec)\n ";
            }));

        }
        for (Future<String> f : futures) {
            System.out.println(f.get());
        }

        Thread.sleep(10000);
        service.shutdown();

    } finally {
        try {
            if (reader != null) {
                reader.close();
            }
            if (directory != null) {
                directory.close();
            }
            if (test != null) {
                test.close();
            }
            if (train != null) {
                train.close();
            }
            if (cv != null) {
                cv.close();
            }
            if (testReader != null) {
                testReader.close();
            }
        } catch (Throwable e) {
            e.printStackTrace();
        }
    }
}

From source file: com.o19s.es.ltr.query.LtrQueryTests.java

License: Apache License

@Before
public void setupIndex() throws IOException {
    dirUnderTest = newDirectory();
    List<Similarity> sims = Arrays.asList(new ClassicSimilarity(), new SweetSpotSimilarity(), // extends Classic
            new BM25Similarity(), new LMDirichletSimilarity(), new BooleanSimilarity(),
            new LMJelinekMercerSimilarity(0.2F), new AxiomaticF3LOG(0.5F, 10),
            new DFISimilarity(new IndependenceChiSquared()),
            new DFRSimilarity(new BasicModelBE(), new AfterEffectB(), new NormalizationH1()),
            new IBSimilarity(new DistributionLL(), new LambdaDF(), new NormalizationH3()));
    similarity = sims.get(random().nextInt(sims.size()));

    indexWriterUnderTest = new RandomIndexWriter(random(), dirUnderTest,
            newIndexWriterConfig().setSimilarity(similarity));
    for (int i = 0; i < docs.length; i++) {
        Document doc = new Document();
        doc.add(newStringField("id", "" + i, Field.Store.YES));
        doc.add(newField("field", docs[i], Store.YES));
        indexWriterUnderTest.addDocument(doc);
    }
    indexWriterUnderTest.commit();
    indexWriterUnderTest.forceMerge(1);
    indexWriterUnderTest.flush();

    indexReaderUnderTest = indexWriterUnderTest.getReader();
    searcherUnderTest = newSearcher(indexReaderUnderTest);
    searcherUnderTest.setSimilarity(similarity);
}

From source file: org.archive.search.IndexSearch.java

License: Apache License

public static ArrayList<ResultSlot> initialLuceneSearch(SimType simType, String searchQuery, int slotNumber)
        throws Exception {
    // String queryStr = "apple";
    //int resultNum = 10;
    //String field = "content";
    if (!solrIni) {
        solrIndexReader = DirectoryReader.open(FSDirectory.open(new File(solrIndexDir)));
        solrSearcher = new IndexSearcher(solrIndexReader);

        if (simType == SimType.LM) {
            solrSimilarity = new LMDirichletSimilarity();
            solrSearcher.setSimilarity(solrSimilarity);
        } else if (simType == SimType.TFIDF) {
            //use default
        } else {
            System.err.println("SimType Input Error!");
            System.exit(1);
        }

        solrAnalyzer = new StandardAnalyzer(Version.LUCENE_48);
        //solrParser = new QueryParser(Version.LUCENE_48, field, solrAnalyzer);
        solrParser = new MultiFieldQueryParser(Version.LUCENE_48, new String[] { "title", "content" },
                solrAnalyzer);

        solrIni = true;
    }

    Query query = solrParser.parse(searchQuery);

    // Collect enough docs to show 5 pages
    TopDocs resultList = solrSearcher.search(query, slotNumber);
    ScoreDoc[] hitList = resultList.scoreDocs;

    ArrayList<ResultSlot> slotList = new ArrayList<>();
    for (int i = 0; i < hitList.length; i++) {
        ScoreDoc hit = hitList[i];

        //

        Document doc = solrSearcher.doc(hit.doc);

        //

        String id = doc.get("id");

        slotList.add(new ResultSlot(id, (i + 1), hit.score));
    }

    if (debug) {
        System.out.println("search results:");
        System.out.println();
        for (ScoreDoc hit : hitList) {
            System.out.println("doc=" + hit.doc + " score=" + hit.score);
            Document doc = solrSearcher.doc(hit.doc);
            String id = doc.get("id");
            System.out.println("id\t" + id);
            System.out.println("-------- lp file -------");
            System.out.println(fetchLPFile(id).get("text"));
            System.out.println();
        }
    }

    return slotList;
}

From source file: org.archive.search.IndexSearch.java

License: Apache License

private static void getTop10Results(TemRunType runType) throws Exception {
    // queries
    String qFile;

    BufferedWriter top20IDWriter;
    BufferedWriter top20SolrWriter;
    BufferedWriter top20CheckWriter;

    if (runType == TemRunType.DryRun) {
        qFile = TDirectory.ROOT_DATASET
                + "Temporalia/DryRun/ntcir11_Temporalia_ntcir11-temporalia-tqic-dryrun.txt";

        top20IDWriter = IOBox.getBufferedWriter_UTF8(
                TDirectory.ROOT_OUTPUT + "/top10/idmap_" + TemRunType.DryRun.toString() + ".txt");
        top20SolrWriter = IOBox.getBufferedWriter_UTF8(
                TDirectory.ROOT_OUTPUT + "/top10/solr_" + TemRunType.DryRun.toString() + ".txt");
        top20CheckWriter = IOBox.getBufferedWriter_UTF8(
                TDirectory.ROOT_OUTPUT + "/top10/check_" + TemRunType.DryRun.toString() + ".txt");
    } else {
        qFile = TDirectory.ROOT_DATASET
                + "Temporalia/FormalRun/ntcir11_Temporalia_NTCIR-11TQICQueriesFormalRun.txt";

        top20IDWriter = IOBox.getBufferedWriter_UTF8(
                TDirectory.ROOT_OUTPUT + "/top10/idmap_" + TemRunType.FormalRun.toString() + ".txt");
        top20SolrWriter = IOBox.getBufferedWriter_UTF8(
                TDirectory.ROOT_OUTPUT + "/top10/solr_" + TemRunType.FormalRun.toString() + ".txt");
        top20CheckWriter = IOBox.getBufferedWriter_UTF8(
                TDirectory.ROOT_OUTPUT + "/top10/check_" + TemRunType.FormalRun.toString() + ".txt");
    }

    ArrayList<String> lineList = IOBox.getLinesAsAList_UTF8(qFile);

    //build a standard pseudo-xml file
    StringBuffer buffer = new StringBuffer();
    buffer.append("<add>");
    for (String line : lineList) {
        buffer.append(TemLoader.stripNonValidXMLCharacters(line));
    }
    buffer.append("</add>");

    SAXBuilder saxBuilder = new SAXBuilder();
    org.jdom.Document xmlDoc = saxBuilder
            .build(new InputStreamReader(new ByteArrayInputStream(buffer.toString().getBytes("UTF-8"))));
    Element webtrackElement = xmlDoc.getRootElement();
    List<Element> queryList = webtrackElement.getChildren("query");

    ArrayList<StrStr> qList = new ArrayList<>();
    for (Element query : queryList) {
        qList.add(new StrStr(query.getChildText("id").trim(), query.getChildText("query_string").trim()));
    }

    //solr search
    solrIndexReader = DirectoryReader.open(FSDirectory.open(new File(solrIndexDir)));
    solrSearcher = new IndexSearcher(solrIndexReader);
    solrSimilarity = new LMDirichletSimilarity();
    solrSearcher.setSimilarity(solrSimilarity);
    solrAnalyzer = new StandardAnalyzer(Version.LUCENE_48);
    solrParser = new MultiFieldQueryParser(Version.LUCENE_48, new String[] { "title", "content" },
            solrAnalyzer);

    //check search
    lpIndexReader = DirectoryReader.open(FSDirectory.open(new File(lpIndexDir)));
    lpSearcher = new IndexSearcher(lpIndexReader);

    //

    int count = 1;
    for (StrStr q : qList) {
        System.out.println((count++));
        //1
        Query solrQuery = solrParser.parse(q.second);
        TopDocs solrResultList = solrSearcher.search(solrQuery, 20);
        ScoreDoc[] solrHitList = solrResultList.scoreDocs;

        ArrayList<String> docidList = new ArrayList<>();

        for (int i = 0; i < solrHitList.length; i++) {
            ScoreDoc solrHit = solrHitList[i];
            Document doc = solrSearcher.doc(solrHit.doc);
            String docid = doc.get("id");
            docidList.add(docid);
        }

        //id map
        top20IDWriter.write(q.first);
        top20IDWriter.newLine();
        for (String docid : docidList) {
            top20IDWriter.write("\t" + docid);
            top20IDWriter.newLine();
        }

        //solr doc
        for (int i = 0; i < solrHitList.length; i++) {
            ScoreDoc solrHit = solrHitList[i];
            Document solrDoc = solrSearcher.doc(solrHit.doc);
            top20SolrWriter.write(TemLoader.toSolrXml(solrDoc));
            top20SolrWriter.newLine();
        }

        //check doc
        for (String docid : docidList) {
            Query checkQuery = lpParser.parse(docid);
            TopDocs checkResults = lpSearcher.search(checkQuery, 2);
            ScoreDoc[] checkHits = checkResults.scoreDocs;
            Document checkDoc = lpSearcher.doc(checkHits[0].doc);

            top20CheckWriter.write(TemLoader.toCheckXml(checkDoc));
            top20CheckWriter.newLine();
        }
    }

    //
    top20IDWriter.flush();
    top20IDWriter.close();

    top20SolrWriter.flush();
    top20SolrWriter.close();

    top20CheckWriter.flush();
    top20CheckWriter.close();
}