Example usage for org.apache.lucene.store FSDirectory close

Introduction

This page collects example usages of org.apache.lucene.store.FSDirectory.close() from open-source projects.

Prototype

@Override
public synchronized void close() throws IOException
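
FSDirectory implements java.io.Closeable, so the idiomatic way to guarantee close() runs is try-with-resources, which closes resources in reverse order even when the body throws. The following is a minimal sketch of that pattern, not taken from any of the projects below; it assumes Lucene 5+ (where FSDirectory.open takes a java.nio.file.Path) and an existing index at a hypothetical path:

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;

public class FSDirectoryCloseExample {
    public static void main(String[] args) throws Exception {
        // hypothetical index location; replace with a real index path
        try (FSDirectory directory = FSDirectory.open(Paths.get("/tmp/example-index"));
                IndexReader reader = DirectoryReader.open(directory)) {
            // the reader is closed first, then the directory,
            // even if this block throws
            System.out.println("docs in index: " + reader.maxDoc());
        }
    }
}

The examples below target older Lucene versions and close the directory manually; note that an exception thrown between reader.close() and directory.close() would leak the directory's file handles.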

Usage

From source file:aos.lucene.tools.BooksMoreLikeThis.java

License:Apache License

public static void main(String[] args) throws Throwable {

    String indexDir = System.getProperty("index.dir");
    FSDirectory directory = FSDirectory.open(new File(indexDir));
    IndexReader reader = DirectoryReader.open(directory);

    IndexSearcher searcher = new IndexSearcher(reader);

    int numDocs = reader.maxDoc();

    MoreLikeThis mlt = new MoreLikeThis(reader);
    mlt.setFieldNames(new String[] { "title", "author" });
    mlt.setMinTermFreq(1);
    mlt.setMinDocFreq(1);

    for (int docID = 0; docID < numDocs; docID++) {
        LOGGER.info(""); // blank line between entries
        Document doc = reader.document(docID);
        LOGGER.info(doc.get("title"));

        Query query = mlt.like(docID);
        LOGGER.info("  query=" + query);

        TopDocs similarDocs = searcher.search(query, 10);
        if (similarDocs.totalHits == 0)
            LOGGER.info("  None like this");
        for (int i = 0; i < similarDocs.scoreDocs.length; i++) {
            if (similarDocs.scoreDocs[i].doc != docID) {
                doc = reader.document(similarDocs.scoreDocs[i].doc);
                LOGGER.info("  -> " + doc.getField("title").stringValue());
            }
        }
    }

    reader.close();
    directory.close();
}

From source file:axiom.db.utils.LuceneManipulator.java

License:Open Source License

public void compress(String dbDir) throws Exception {
    System.setProperty("org.apache.lucene.FSDirectory.class", "org.apache.lucene.store.TransFSDirectory");

    File dbhome = new File(dbDir);
    String url = getUrl(dbhome);

    FSDirectory indexDir = FSDirectory.getDirectory(dbhome, false);
    if (indexDir instanceof TransFSDirectory) {
        FSDirectory.setDisableLocks(true);
        TransFSDirectory d = (TransFSDirectory) indexDir;
        d.setDriverClass(DRIVER_CLASS);
        d.setUrl(url);
        d.setUser(null);
        d.setPassword(null);
    }

    File ndbhome = new File(dbhome.getParentFile(), dbhome.getName() + "_tmp");
    File olddbhome = new File(dbhome.getParentFile(), dbhome.getName() + "_old");
    FSDirectory nindexDir = FSDirectory.getDirectory(ndbhome, true);
    if (nindexDir instanceof TransFSDirectory) {
        FSDirectory.setDisableLocks(true);
        TransFSDirectory d = (TransFSDirectory) nindexDir;
        d.setDriverClass(DRIVER_CLASS);
        d.setUrl(url);
        d.setUser(null);
        d.setPassword(null);
    }

    IndexSearcher searcher = null;
    IndexWriter writer = null;
    LuceneManager lmgr = null;

    try {
        searcher = new IndexSearcher(indexDir);
        PerFieldAnalyzerWrapper a = LuceneManager.buildAnalyzer();
        writer = IndexWriterManager.getWriter(nindexDir, a, true);
        final int numDocs = searcher.getIndexReader().numDocs();

        HashSet deldocs = new HashSet();
        HashMap infos = new HashMap();
        // first pass: collect the ids of deleted documents and, for
        // duplicated ids, the index of the most recently modified copy
        for (int i = 0; i < numDocs; i++) {
            Document doc = searcher.doc(i);
            String delprop = doc.get(DeletedInfos.DELETED);
            final String id = doc.get(LuceneManager.ID) + DeletedInfos.KEY_SEPERATOR
                    + doc.get(LuceneManager.LAYER_OF_SAVE);
            if (delprop != null && "true".equals(delprop)) {
                deldocs.add(id);
            } else {
                Object v;
                if ((v = infos.get(id)) == null) {
                    infos.put(id, new Integer(i));
                } else {
                    final String lmod = doc.get(LuceneManager.LASTMODIFIED);
                    final String lmod_prev = searcher.doc(((Integer) v).intValue()).get("_lastmodified");
                    if (lmod_prev == null || (lmod != null && lmod.compareTo(lmod_prev) > 0)) {
                        infos.put(id, new Integer(i));
                    }
                }
            }
        }

        ArrayList listOfMaps = new ArrayList();

        // second pass: copy each surviving (non-deleted, newest) document
        // into the new index
        for (int i = 0; i < numDocs; i++) {
            Document doc = searcher.doc(i);
            String delprop = doc.get(DeletedInfos.DELETED);
            String layerStr = doc.get(LuceneManager.LAYER_OF_SAVE);
            int layer = -1;
            try {
                layer = Integer.parseInt(layerStr);
            } catch (Exception ex) {
                layer = -1;
            }
            final String id = doc.get(LuceneManager.ID) + DeletedInfos.KEY_SEPERATOR
                    + doc.get(LuceneManager.LAYER_OF_SAVE);
            if (delprop != null && "true".equals(delprop)) {
                continue;
            } else if (id != null && deldocs.contains(id)) {
                continue;
            }

            Integer idx = (Integer) infos.get(id);
            if (idx != null && i != idx.intValue()) {
                continue;
            }

            Document ndoc = convertDocument(doc);

            if (ndoc != null) {
                writer.addDocument(ndoc);
            }
        }

    } catch (Exception ex) {
        ex.printStackTrace();
        throw new RuntimeException(ex);
    } finally {
        if (searcher != null) {
            try {
                searcher.close();
            } catch (Exception ex) {
            }
        }

        if (lmgr != null) {
            lmgr.shutdown();
            lmgr = null;
        }

        indexDir.close();
        SegmentInfos sinfos = IndexObjectsFactory.getFSSegmentInfos(indexDir);
        sinfos.clear();
        IndexObjectsFactory.removeDeletedInfos(indexDir);
    }

    Connection conn = null;
    boolean exceptionOccured = false;

    try {
        if (writer != null) {
            conn = DriverManager.getConnection(url);
            conn.setAutoCommit(false);
            writer.close();
            writer.flushCache();
            LuceneManager.commitSegments(null, conn, dbhome, writer.getDirectory());
            writer.finalizeTrans();
        }
    } catch (Exception ex) {
        ex.printStackTrace();
        exceptionOccured = true;
        throw new RuntimeException(ex);
    } finally {
        if (conn != null) {
            try {
                if (!conn.getAutoCommit()) {
                    if (!exceptionOccured) {
                        conn.commit();
                    } else {
                        conn.rollback();
                    }
                }
                conn.close();
            } catch (Exception ex) {
                ex.printStackTrace();
            }
            conn = null;
        }

        nindexDir.close();
        SegmentInfos sinfos = IndexObjectsFactory.getFSSegmentInfos(nindexDir);
        sinfos.clear();
        IndexObjectsFactory.removeDeletedInfos(nindexDir);
    }

    File[] files = dbhome.listFiles();
    for (int i = 0; i < files.length; i++) {
        if (!files[i].isDirectory()) {
            files[i].delete();
        }
    }

    files = ndbhome.listFiles();
    for (int i = 0; i < files.length; i++) {
        if (!files[i].isDirectory()) {
            File nfile = new File(dbhome, files[i].getName());
            files[i].renameTo(nfile);
        }
    }

    if (!FileUtils.deleteDir(ndbhome)) {
        throw new Exception("Could not delete " + ndbhome);
    }
}

From source file:axiom.objectmodel.dom.convert.LuceneConvertor.java

License:Open Source License

public void convert(Application app, File dbhome) throws Exception {
    FSDirectory indexDir = FSDirectory.getDirectory(dbhome, false);
    if (indexDir instanceof TransFSDirectory) {
        FSDirectory.setDisableLocks(true);
        TransFSDirectory d = (TransFSDirectory) indexDir;
        TransSource source = app.getTransSource();
        d.setDriverClass(source.getDriverClass());
        d.setUrl(source.getUrl());
        d.setUser(source.getUser());
        d.setPassword(source.getPassword());
    }
    File ndbhome = new File(dbhome.getParentFile(), dbhome.getName() + "_tmp");
    File olddbhome = new File(dbhome.getParentFile(), dbhome.getName() + "_old");
    FSDirectory nindexDir = FSDirectory.getDirectory(ndbhome, true);
    if (nindexDir instanceof TransFSDirectory) {
        FSDirectory.setDisableLocks(true);
        TransFSDirectory d = (TransFSDirectory) nindexDir;
        TransSource source = app.getTransSource();
        d.setDriverClass(source.getDriverClass());
        d.setUrl(source.getUrl());
        d.setUser(source.getUser());
        d.setPassword(source.getPassword());
    }

    IndexSearcher searcher = null;
    IndexWriter writer = null;
    LuceneManager lmgr = null;

    try {
        searcher = new IndexSearcher(indexDir);
        PerFieldAnalyzerWrapper a = LuceneManager.buildAnalyzer();
        writer = IndexWriterManager.getWriter(nindexDir, a, true);
        final int numDocs = searcher.getIndexReader().numDocs();

        HashSet deldocs = new HashSet();
        HashMap infos = new HashMap();
        // first pass: collect the ids of deleted documents and, for
        // duplicated ids, the index of the most recently modified copy
        for (int i = 0; i < numDocs; i++) {
            Document doc = searcher.doc(i);
            String delprop = doc.get(DeletedInfos.DELETED);
            String layerStr = doc.get(LuceneManager.LAYER_OF_SAVE);
            int layer = -1;
            try {
                layer = Integer.parseInt(layerStr);
            } catch (Exception ex) {
                layer = -1;
            }
            final String id = doc.get(LuceneManager.ID) + DeletedInfos.KEY_SEPERATOR
                    + doc.get(LuceneManager.LAYER_OF_SAVE);
            if (delprop != null && "true".equals(delprop)/* && layer == DbKey.LIVE_LAYER*/) {
                deldocs.add(id);
            } else {
                Object v;
                if ((v = infos.get(id)) == null) {
                    infos.put(id, new Integer(i));
                } else {
                    final String lmod = doc.get(LuceneManager.LASTMODIFIED);
                    final String lmod_prev = searcher.doc(((Integer) v).intValue()).get("_lastmodified");
                    if (lmod_prev == null || (lmod != null && lmod.compareTo(lmod_prev) > 0)) {
                        infos.put(id, new Integer(i));
                    }
                }
            }
        }

        ArrayList listOfMaps = new ArrayList();

        // second pass: copy each surviving (non-deleted, newest) document
        // into the new index
        for (int i = 0; i < numDocs; i++) {
            Document doc = searcher.doc(i);
            String delprop = doc.get(DeletedInfos.DELETED);
            String layerStr = doc.get(LuceneManager.LAYER_OF_SAVE);
            int layer = -1;
            try {
                layer = Integer.parseInt(layerStr);
            } catch (Exception ex) {
                layer = -1;
            }
            final String id = doc.get(LuceneManager.ID) + DeletedInfos.KEY_SEPERATOR
                    + doc.get(LuceneManager.LAYER_OF_SAVE);
            if (delprop != null && "true".equals(delprop)) {
                continue;
            } else if (id != null && deldocs.contains(id)/* && layer == DbKey.LIVE_LAYER*/) {
                continue;
            }

            Integer idx = (Integer) infos.get(id);
            if (idx != null && i != idx.intValue()) {
                continue;
            }

            Document ndoc = convertDocument(doc);

            if (this.recordNodes) {
                listOfMaps.add(LuceneManager.luceneDocumentToMap(doc));
            }

            if (ndoc != null) {
                writer.addDocument(ndoc);
            }
        }

        if (this.recordNodes) {
            lmgr = new LuceneManager(this.app, false, true);
            this.allNodes = new HashMap();
            final int size = listOfMaps.size();
            for (int i = 0; i < size; i++) {
                HashMap m = (HashMap) listOfMaps.get(i);
                INode n = lmgr.mapToNode(m);
                this.allNodes.put(n.getID(), getPath(n));
                n = null;
            }
        }

    } catch (Exception ex) {
        ex.printStackTrace();
        throw new RuntimeException(ex);
    } finally {
        if (searcher != null) {
            try {
                searcher.close();
            } catch (Exception ex) {
                app.logError(ErrorReporter.errorMsg(this.getClass(), "convert"), ex);
            }
        }

        if (lmgr != null) {
            lmgr.shutdown();
            lmgr = null;
        }

        indexDir.close();
        SegmentInfos sinfos = IndexObjectsFactory.getFSSegmentInfos(indexDir);
        sinfos.clear();
        IndexObjectsFactory.removeDeletedInfos(indexDir);
    }

    Connection conn = null;
    boolean exceptionOccured = false;

    try {
        if (writer != null) {
            TransSource ts = app.getTransSource();
            conn = ts.getConnection();

            DatabaseMetaData dmd = conn.getMetaData();
            ResultSet rs = dmd.getColumns(null, null, "Lucene", "version");
            if (!rs.next()) {
                final String alterTbl = "ALTER TABLE Lucene ADD version INT NOT NULL DEFAULT 1";
                PreparedStatement pstmt = null;
                try {
                    pstmt = conn.prepareStatement(alterTbl);
                    pstmt.execute();
                } catch (SQLException sqle) {
                    app.logError(ErrorReporter.errorMsg(this.getClass(), "convert"), sqle);
                } finally {
                    if (pstmt != null) {
                        pstmt.close();
                        pstmt = null;
                    }
                }
            }
            rs.close();
            rs = null;

            writer.close();
            writer.flushCache();//TODO:writer.writeSegmentsFile();
            LuceneManager.commitSegments(conn, app, writer.getDirectory());
            writer.finalizeTrans();

            this.updateSQL(conn);
        }
    } catch (Exception ex) {
        ex.printStackTrace();
        exceptionOccured = true;
        throw new RuntimeException(ex);
    } finally {
        if (conn != null) {
            try {
                if (!conn.getAutoCommit()) {
                    if (!exceptionOccured) {
                        conn.commit();
                    } else {
                        conn.rollback();
                    }
                }
                conn.close();
            } catch (Exception ex) {
                app.logError(ErrorReporter.errorMsg(this.getClass(), "convert"), ex);
            }
            conn = null;
        }

        nindexDir.close();
        SegmentInfos sinfos = IndexObjectsFactory.getFSSegmentInfos(nindexDir);
        sinfos.clear();
        IndexObjectsFactory.removeDeletedInfos(nindexDir);
    }

    if (!dbhome.renameTo(olddbhome)) {
        throw new Exception("Could not move the old version of the db into " + olddbhome);
    }

    if (!ndbhome.renameTo(dbhome)) {
        throw new Exception("Could not move the newer version of the db into " + dbhome);
    }

    File oldBlobDir = new File(olddbhome, "blob");
    File newBlobDir = new File(dbhome, "blob"); // dbhome holds the new db after the renames above
    oldBlobDir.renameTo(newBlobDir);

    if (!FileUtils.deleteDir(olddbhome)) {
        throw new Exception("Could not delete the old version of the db at " + olddbhome);
    }
}

From source file:com.berico.clavin.index.IndexDirectoryBuilder.java

License:Apache License

/**
 * Turns a GeoNames gazetteer file into a Lucene index, and adds
 * some supplementary gazetteer records at the end.
 *
 * @param args            not used
 * @throws IOException
 */
public static void main(String[] args) throws IOException {

    logger.info("Indexing... please wait.");

    // Create a new index file on disk, allowing Lucene to choose
    // the best FSDirectory implementation given the environment.
    // TODO: delete this directory first, if it exists
    FSDirectory index = FSDirectory.open(new File("./IndexDirectory"));

    // indexing by lower-casing & tokenizing on whitespace
    Analyzer indexAnalyzer = new WhitespaceLowerCaseAnalyzer();

    // create the object that will actually build the Lucene index
    IndexWriter indexWriter = new IndexWriter(index, new IndexWriterConfig(Version.LUCENE_40, indexAnalyzer));

    // open the gazetteer files to be loaded
    BufferedReader r = new BufferedReader(
            new InputStreamReader(new FileInputStream(new File(pathToGazetteer)), "UTF-8"));
    BufferedReader r2 = new BufferedReader(new InputStreamReader(
            new FileInputStream(new File("./src/main/resources/SupplementaryGazetteer.txt")), "UTF-8"));

    String line;

    // let's see how long this takes...
    Date start = new Date();

    // load GeoNames gazetteer into Lucene index
    while ((line = r.readLine()) != null)
        addToIndex(indexWriter, line);

    // add supplementary gazetteer records to index
    while ((line = r2.readLine()) != null)
        addToIndex(indexWriter, line);

    // that wasn't so long, was it?
    Date stop = new Date();

    logger.info("[DONE]");
    logger.info(indexWriter.maxDoc() + " geonames added to index.");
    logger.info("Merging indices... please wait.");

    indexWriter.close();
    index.close();
    r.close();
    r2.close();

    logger.info("[DONE]");

    DateFormat df = new SimpleDateFormat("HH:mm:ss");
    long elapsed_MILLIS = stop.getTime() - start.getTime();
    logger.info("Process started: " + df.format(start) + ", ended: " + df.format(stop) + "; elapsed time: "
            + MILLISECONDS.toSeconds(elapsed_MILLIS) + " seconds.");
}

From source file:com.bericotech.clavin.index.IndexDirectoryBuilder.java

License:Apache License

public void buildIndex(final File indexDir, final List<File> gazetteerFiles, final File altNamesFile)
        throws IOException {
    LOG.info("Indexing... please wait.");

    indexCount = 0;

    // Create a new index file on disk, allowing Lucene to choose
    // the best FSDirectory implementation given the environment.
    FSDirectory index = FSDirectory.open(indexDir);

    // indexing by lower-casing & tokenizing on whitespace
    Analyzer indexAnalyzer = new WhitespaceLowerCaseAnalyzer();

    // create the object that will actually build the Lucene index
    indexWriter = new IndexWriter(index, new IndexWriterConfig(Version.LUCENE_4_9, indexAnalyzer));

    // let's see how long this takes...
    Date start = new Date();

    // if we were given an alternate names file, process it
    if (altNamesFile != null) {
        loadAlternateNames(altNamesFile);
    }

    // load GeoNames gazetteer into Lucene index
    String line;
    int count = 0;
    for (File gazetteer : gazetteerFiles) {
        LOG.info("Processing Gazetteer: {}", gazetteer.getAbsolutePath());
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(gazetteer), "UTF-8"));
        while ((line = reader.readLine()) != null) {
            try {
                count += 1;
                // print progress update to console
                if (count % 100000 == 0) {
                    LOG.info("rowcount: " + count);
                }
                GeoName geoName = BasicGeoName.parseFromGeoNamesRecord(line);
                resolveAncestry(geoName);
            } catch (IOException e) {
                LOG.info("Skipping... Error on line: {}", line);
            } catch (RuntimeException re) {
                LOG.info("Skipping... Error on line: {}", line);
            }
        }
        reader.close();
    }

    // that wasn't so long, was it?
    Date stop = new Date();

    LOG.info("Unresolved GeoNames (Pre-resolution)");
    logUnresolved();

    resolveUnresolved();

    LOG.info("Unresolved GeoNames (Post-resolution)");
    logUnresolved();

    LOG.info("Indexing unresolved GeoNames.");
    for (Set<GeoName> geos : unresolvedMap.values()) {
        for (GeoName nm : geos) {
            indexGeoName(nm);
        }
    }

    LOG.info("[DONE]");
    LOG.info("{} geonames added to index. ({} records)", indexWriter.maxDoc(), indexCount);
    LOG.info("Merging indices... please wait.");

    indexWriter.close();
    index.close();

    LOG.info("[DONE]");

    DateFormat df = new SimpleDateFormat("HH:mm:ss");
    long elapsed_MILLIS = stop.getTime() - start.getTime();
    LOG.info("Process started: " + df.format(start) + ", ended: " + df.format(stop) + "; elapsed time: "
            + MILLISECONDS.toSeconds(elapsed_MILLIS) + " seconds.");
}

From source file:com.github.tteofili.looseen.Test20NewsgroupsClassification.java

License:Apache License

@Test
public void test20Newsgroups() throws Exception {

    String indexProperty = System.getProperty("index");
    if (indexProperty != null) {
        try {
            index = Boolean.valueOf(indexProperty);
        } catch (Exception e) {
            // ignore
        }
    }

    String splitProperty = System.getProperty("split");
    if (splitProperty != null) {
        try {
            split = Boolean.valueOf(splitProperty);
        } catch (Exception e) {
            // ignore
        }
    }

    Path mainIndexPath = Paths.get(INDEX + "/original");
    Directory directory = FSDirectory.open(mainIndexPath);
    Path trainPath = Paths.get(INDEX + "/train");
    Path testPath = Paths.get(INDEX + "/test");
    Path cvPath = Paths.get(INDEX + "/cv");
    FSDirectory cv = null;
    FSDirectory test = null;
    FSDirectory train = null;
    IndexReader testReader = null;
    if (split) {
        cv = FSDirectory.open(cvPath);
        test = FSDirectory.open(testPath);
        train = FSDirectory.open(trainPath);
    }

    if (index) {
        delete(mainIndexPath);
        if (split) {
            delete(trainPath, testPath, cvPath);
        }
    }

    IndexReader reader = null;
    List<Classifier<BytesRef>> classifiers = new LinkedList<>();
    try {
        Analyzer analyzer = new StandardAnalyzer();
        if (index) {

            System.out.format("Indexing 20 Newsgroups...%n");

            long startIndex = System.currentTimeMillis();
            IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(analyzer));

            buildIndex(new File(PREFIX + "/20n/20_newsgroups"), indexWriter);

            long endIndex = System.currentTimeMillis();
            System.out.format("Indexed %d pages in %ds %n", indexWriter.maxDoc(),
                    (endIndex - startIndex) / 1000);

            indexWriter.close();

        }

        if (split && !index) {
            reader = DirectoryReader.open(train);
        } else {
            reader = DirectoryReader.open(directory);
        }

        if (index && split) {
            // split the index
            System.out.format("Splitting the index...%n");

            long startSplit = System.currentTimeMillis();
            DatasetSplitter datasetSplitter = new DatasetSplitter(0.1, 0);
            datasetSplitter.split(reader, train, test, cv, analyzer, false, CATEGORY_FIELD, BODY_FIELD,
                    SUBJECT_FIELD, CATEGORY_FIELD);
            reader.close();
            reader = DirectoryReader.open(train); // using the train index from now on
            long endSplit = System.currentTimeMillis();
            System.out.format("Splitting done in %ds %n", (endSplit - startSplit) / 1000);
        }

        final long startTime = System.currentTimeMillis();

        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 1, 0, 0,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, null, analyzer, null, 1, 0, 0, CATEGORY_FIELD,
                BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, 0, 0,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new AxiomaticF1EXP(), analyzer, null, 3, 0, 0,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new AxiomaticF1LOG(), analyzer, null, 3, 0, 0,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new LMDirichletSimilarity(), analyzer, null, 3,
                1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new LMJelinekMercerSimilarity(0.3f), analyzer,
                null, 3, 1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, null, analyzer, null, 3, 1, 1, CATEGORY_FIELD,
                BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new DFRSimilarity(new BasicModelG(), new AfterEffectB(), new NormalizationH1()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new DFRSimilarity(new BasicModelP(), new AfterEffectL(), new NormalizationH3()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new IBSimilarity(new DistributionSPL(), new LambdaDF(), new Normalization.NoNormalization()),
                analyzer, null, 3, 1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new IBSimilarity(new DistributionLL(), new LambdaTTF(), new NormalizationH1()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new MinHashClassifier(reader, BODY_FIELD, CATEGORY_FIELD, 15, 1, 100));
        classifiers.add(new MinHashClassifier(reader, BODY_FIELD, CATEGORY_FIELD, 30, 3, 300));
        classifiers.add(new MinHashClassifier(reader, BODY_FIELD, CATEGORY_FIELD, 10, 1, 100));
        classifiers.add(new KNearestFuzzyClassifier(reader, new LMJelinekMercerSimilarity(0.3f), analyzer, null,
                1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader,
                new IBSimilarity(new DistributionLL(), new LambdaTTF(), new NormalizationH1()), analyzer, null,
                1, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 1,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 3,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers
                .add(new KNearestFuzzyClassifier(reader, null, analyzer, null, 1, CATEGORY_FIELD, BODY_FIELD));
        classifiers
                .add(new KNearestFuzzyClassifier(reader, null, analyzer, null, 3, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new AxiomaticF1EXP(), analyzer, null, 3,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new AxiomaticF1LOG(), analyzer, null, 3,
                CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new BM25NBClassifier(reader, analyzer, null, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new CachingNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, BODY_FIELD));
        classifiers.add(new SimpleNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, BODY_FIELD));

        int maxdoc;

        if (split) {
            testReader = DirectoryReader.open(test);
            maxdoc = testReader.maxDoc();
        } else {
            maxdoc = reader.maxDoc();
        }

        System.out.format("Starting evaluation on %d docs...%n", maxdoc);

        ExecutorService service = Executors.newCachedThreadPool();
        List<Future<String>> futures = new LinkedList<>();
        for (Classifier<BytesRef> classifier : classifiers) {
            testClassifier(reader, startTime, testReader, service, futures, classifier);
        }
        for (Future<String> f : futures) {
            System.out.println(f.get());
        }

        Thread.sleep(10000);
        service.shutdown();

    } finally {
        if (reader != null) {
            reader.close();
        }
        directory.close();
        if (test != null) {
            test.close();
        }
        if (train != null) {
            train.close();
        }
        if (cv != null) {
            cv.close();
        }
        if (testReader != null) {
            testReader.close();
        }

        for (Classifier c : classifiers) {
            if (c instanceof Closeable) {
                ((Closeable) c).close();
            }
        }
    }
}

From source file:com.github.tteofili.looseen.TestWikipediaClassification.java

License:Apache License

@Test
public void testItalianWikipedia() throws Exception {

    String indexProperty = System.getProperty("index");
    if (indexProperty != null) {
        try {
            index = Boolean.valueOf(indexProperty);
        } catch (Exception e) {
            // ignore
        }
    }

    String splitProperty = System.getProperty("split");
    if (splitProperty != null) {
        try {
            split = Boolean.valueOf(splitProperty);
        } catch (Exception e) {
            // ignore
        }
    }

    Path mainIndexPath = Paths.get(INDEX + "/original");
    Directory directory = FSDirectory.open(mainIndexPath);
    Path trainPath = Paths.get(INDEX + "/train");
    Path testPath = Paths.get(INDEX + "/test");
    Path cvPath = Paths.get(INDEX + "/cv");
    FSDirectory cv = null;
    FSDirectory test = null;
    FSDirectory train = null;
    DirectoryReader testReader = null;
    if (split) {
        cv = FSDirectory.open(cvPath);
        test = FSDirectory.open(testPath);
        train = FSDirectory.open(trainPath);
    }

    if (index) {
        delete(mainIndexPath);
        if (split) {
            delete(trainPath, testPath, cvPath);
        }
    }

    IndexReader reader = null;
    try {
        Collection<String> stopWordsList = Arrays.asList("di", "a", "da", "in", "per", "tra", "fra", "il", "lo",
                "la", "i", "gli", "le");
        CharArraySet stopWords = new CharArraySet(stopWordsList, true);
        Analyzer analyzer = new ItalianAnalyzer(stopWords);
        if (index) {

            System.out.format("Indexing Italian Wikipedia...%n");

            long startIndex = System.currentTimeMillis();
            IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(analyzer));

            importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current1.xml"), indexWriter);
            importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current2.xml"), indexWriter);
            importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current3.xml"), indexWriter);
            importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current4.xml"), indexWriter);

            long endIndex = System.currentTimeMillis();
            System.out.format("Indexed %d pages in %ds %n", indexWriter.maxDoc(),
                    (endIndex - startIndex) / 1000);

            indexWriter.close();

        }

        if (split && !index) {
            reader = DirectoryReader.open(train);
        } else {
            reader = DirectoryReader.open(directory);
        }

        if (index && split) {
            // split the index
            System.out.format("Splitting the index...%n");

            long startSplit = System.currentTimeMillis();
            DatasetSplitter datasetSplitter = new DatasetSplitter(0.1, 0);
            for (LeafReaderContext context : reader.leaves()) {
                datasetSplitter.split(context.reader(), train, test, cv, analyzer, false, CATEGORY_FIELD,
                        TEXT_FIELD, CATEGORY_FIELD);
            }
            reader.close();
            reader = DirectoryReader.open(train); // using the train index from now on
            long endSplit = System.currentTimeMillis();
            System.out.format("Splitting done in %ds %n", (endSplit - startSplit) / 1000);
        }

        final long startTime = System.currentTimeMillis();

        List<Classifier<BytesRef>> classifiers = new LinkedList<>();
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 1, 0, 0,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new BM25Similarity(), analyzer, null, 1, 0, 0,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, null, analyzer, null, 1, 0, 0, CATEGORY_FIELD,
                TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new LMDirichletSimilarity(), analyzer, null, 3,
                1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new LMJelinekMercerSimilarity(0.3f), analyzer,
                null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, 0, 0,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, 1, 1,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new DFRSimilarity(new BasicModelG(), new AfterEffectB(), new NormalizationH1()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new DFRSimilarity(new BasicModelP(), new AfterEffectL(), new NormalizationH3()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new IBSimilarity(new DistributionSPL(), new LambdaDF(), new Normalization.NoNormalization()),
                analyzer, null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestNeighborClassifier(reader,
                new IBSimilarity(new DistributionLL(), new LambdaTTF(), new NormalizationH1()), analyzer, null,
                3, 1, 1, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 5, 1, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 10, 1, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 1, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 3, 100));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 3, 300));
        classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 5, 3, 100));
        classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 3,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 1,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new BM25Similarity(), analyzer, null, 3,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new KNearestFuzzyClassifier(reader, new BM25Similarity(), analyzer, null, 1,
                CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new BM25NBClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new CachingNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD));
        classifiers.add(new SimpleNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD));

        int maxdoc;

        if (split) {
            testReader = DirectoryReader.open(test);
            maxdoc = testReader.maxDoc();
        } else {
            maxdoc = reader.maxDoc();
        }

        System.out.format("Starting evaluation on %d docs...%n", maxdoc);

        ExecutorService service = Executors.newCachedThreadPool();
        List<Future<String>> futures = new LinkedList<>();
        for (Classifier<BytesRef> classifier : classifiers) {

            final IndexReader finalReader = reader;
            final DirectoryReader finalTestReader = testReader;
            futures.add(service.submit(() -> {
                ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix;
                if (split) {
                    confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(finalTestReader, classifier,
                            CATEGORY_FIELD, TEXT_FIELD, 60000 * 30);
                } else {
                    confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(finalReader, classifier,
                            CATEGORY_FIELD, TEXT_FIELD, 60000 * 30);
                }

                final long endTime = System.currentTimeMillis();
                final int elapse = (int) (endTime - startTime) / 1000;

                return " * " + classifier + " \n    * accuracy = " + confusionMatrix.getAccuracy()
                        + "\n    * precision = " + confusionMatrix.getPrecision() + "\n    * recall = "
                        + confusionMatrix.getRecall() + "\n    * f1-measure = " + confusionMatrix.getF1Measure()
                        + "\n    * avgClassificationTime = " + confusionMatrix.getAvgClassificationTime()
                        + "\n    * time = " + elapse + " (sec)\n ";
            }));

        }
        for (Future<String> f : futures) {
            System.out.println(f.get());
        }

        Thread.sleep(10000);
        service.shutdown();

    } finally {
        try {
            if (reader != null) {
                reader.close();
            }
            if (directory != null) {
                directory.close();
            }
            if (test != null) {
                test.close();
            }
            if (train != null) {
                train.close();
            }
            if (cv != null) {
                cv.close();
            }
            if (testReader != null) {
                testReader.close();
            }
        } catch (Throwable e) {
            e.printStackTrace();
        }
    }
}

From source file:com.globalsight.ling.tm2.lucene.LuceneIndexWriter.java

License:Apache License

/**
 * Removes all index for a Tm. This method removes all index files
 * and directories that belong to the specified Tm. This method
 * tries to remove Tm in a destructive fashion, meaning that, even
 * if the locks on the indexes have been obtained by other
 * process/thread, this method goes ahead and delete indexes. The
 * other process/thread with locks will get exceptions.
 *
 * @param p_tmId Tm id
 */
static public void removeTm(long p_tmId) throws Exception {
    File tmIndexRoot = LuceneUtil.getGoldTmIndexParentDir(p_tmId);

    String[] localeNames = tmIndexRoot.list();

    // localeNames can be null if no index has been created.
    if (localeNames != null) {
        for (int i = 0; i < localeNames.length; i++) {
            File directory = new File(tmIndexRoot, localeNames[i]);

            try {
                if (!directory.isDirectory()) {
                    FileUtil.deleteFile(directory);
                    continue;
                }

                // delete all index files and Lucene lock files
                FSDirectory fsDirectory = FSDirectory.open(directory);

                // delete our lock file
                Lock lock = fsDirectory.makeLock(LOCK_NAME);
                lock.release();

                fsDirectory.close();

                // delete index directory 
                FileUtil.deleteFile(directory);

                // clean cache if have
                LuceneCache.cleanLuceneCache(directory);
            } catch (Exception e) {
                c_logger.error("Exception is thrown while removing a Tm index. Ignoring...", e);
            }
        }
    }

    // delete index root directory
    tmIndexRoot.delete();
}

From source file:com.leavesfly.lia.tool.BooksMoreLikeThis.java

License:Apache License

public static void main(String[] args) throws Throwable {

    String indexDir = System.getProperty("index.dir");
    FSDirectory directory = FSDirectory.open(new File(indexDir));
    IndexReader reader = IndexReader.open(directory);

    IndexSearcher searcher = new IndexSearcher(reader);

    int numDocs = reader.maxDoc();

    MoreLikeThis mlt = new MoreLikeThis(reader); // #A
    mlt.setFieldNames(new String[] { "title", "author" });
    mlt.setMinTermFreq(1); // #B
    mlt.setMinDocFreq(1);

    for (int docID = 0; docID < numDocs; docID++) { // #C
        System.out.println();
        Document doc = reader.document(docID);
        System.out.println(doc.get("title"));

        Query query = mlt.like(docID); // #D
        System.out.println("  query=" + query);

        TopDocs similarDocs = searcher.search(query, 10);
        if (similarDocs.totalHits == 0)
            System.out.println("  None like this");
        for (int i = 0; i < similarDocs.scoreDocs.length; i++) {
            if (similarDocs.scoreDocs[i].doc != docID) { // #E
                doc = reader.document(similarDocs.scoreDocs[i].doc);
                System.out.println("  -> " + doc.getField("title").stringValue());
            }
        }
    }

    searcher.close();
    reader.close();
    directory.close();
}

From source file:com.mathworks.xzheng.tools.BooksMoreLikeThis.java

License:Apache License

public static void main(String[] args) throws Throwable {

    String indexDir = System.getProperty("index.dir");
    FSDirectory directory = FSDirectory.open(new File(indexDir));
    IndexReader reader = IndexReader.open(directory);

    IndexSearcher searcher = new IndexSearcher(reader);

    int numDocs = reader.maxDoc();

    MoreLikeThis mlt = new MoreLikeThis(reader); // #A
    mlt.setFieldNames(new String[] { "title", "author" });
    mlt.setMinTermFreq(1); // #B
    mlt.setMinDocFreq(1);

    for (int docID = 0; docID < numDocs; docID++) { // #C
        System.out.println();
        Document doc = reader.document(docID);
        System.out.println(doc.get("title"));

        Query query = mlt.like(docID); // #D
        System.out.println("  query=" + query);

        TopDocs similarDocs = searcher.search(query, 10);
        if (similarDocs.totalHits == 0)
            System.out.println("  None like this");
        for (int i = 0; i < similarDocs.scoreDocs.length; i++) {
            if (similarDocs.scoreDocs[i].doc != docID) { // #E
                doc = reader.document(similarDocs.scoreDocs[i].doc);
                System.out.println("  -> " + doc.getField("title").stringValue());
            }
        }
    }

    reader.close();
    directory.close();
}