List of usage examples for org.apache.lucene.store FSDirectory close
@Override
public synchronized void close() throws IOException
From source file:aos.lucene.tools.BooksMoreLikeThis.java
License:Apache License
public static void main(String[] args) throws Throwable { String indexDir = System.getProperty("index.dir"); FSDirectory directory = FSDirectory.open(new File(indexDir)); IndexReader reader = DirectoryReader.open(directory); IndexSearcher searcher = new IndexSearcher(reader); int numDocs = reader.maxDoc(); MoreLikeThis mlt = new MoreLikeThis(reader); mlt.setFieldNames(new String[] { "title", "author" }); mlt.setMinTermFreq(1);//w w w . ja v a2 s .c om mlt.setMinDocFreq(1); for (int docID = 0; docID < numDocs; docID++) { LOGGER.info(); Document doc = reader.document(docID); LOGGER.info(doc.get("title")); Query query = mlt.like(docID); LOGGER.info(" query=" + query); TopDocs similarDocs = searcher.search(query, 10); if (similarDocs.totalHits == 0) LOGGER.info(" None like this"); for (int i = 0; i < similarDocs.scoreDocs.length; i++) { if (similarDocs.scoreDocs[i].doc != docID) { doc = reader.document(similarDocs.scoreDocs[i].doc); LOGGER.info(" -> " + doc.getField("title").stringValue()); } } } reader.close(); directory.close(); }
From source file:axiom.db.utils.LuceneManipulator.java
License:Open Source License
public void compress(String dbDir) throws Exception { System.setProperty("org.apache.lucene.FSDirectory.class", "org.apache.lucene.store.TransFSDirectory"); File dbhome = new File(dbDir); String url = getUrl(dbhome);/* w ww .j a v a 2 s .c o m*/ FSDirectory indexDir = FSDirectory.getDirectory(dbhome, false); if (indexDir instanceof TransFSDirectory) { FSDirectory.setDisableLocks(true); TransFSDirectory d = (TransFSDirectory) indexDir; d.setDriverClass(DRIVER_CLASS); d.setUrl(url); d.setUser(null); d.setPassword(null); } File ndbhome = new File(dbhome.getParentFile(), dbhome.getName() + "_tmp"); File olddbhome = new File(dbhome.getParentFile(), dbhome.getName() + "_old"); FSDirectory nindexDir = FSDirectory.getDirectory(ndbhome, true); if (nindexDir instanceof TransFSDirectory) { FSDirectory.setDisableLocks(true); TransFSDirectory d = (TransFSDirectory) nindexDir; d.setDriverClass(DRIVER_CLASS); d.setUrl(url); d.setUser(null); d.setPassword(null); } IndexSearcher searcher = null; IndexWriter writer = null; LuceneManager lmgr = null; try { searcher = new IndexSearcher(indexDir); PerFieldAnalyzerWrapper a = LuceneManager.buildAnalyzer(); writer = IndexWriterManager.getWriter(nindexDir, a, true); final int numDocs = searcher.getIndexReader().numDocs(); HashSet deldocs = new HashSet(); HashMap infos = new HashMap(); for (int i = 0; i < numDocs; i++) { Document doc = searcher.doc(i); String delprop = doc.get(DeletedInfos.DELETED); final String id = doc.get(LuceneManager.ID) + DeletedInfos.KEY_SEPERATOR + doc.get(LuceneManager.LAYER_OF_SAVE); if (delprop != null && "true".equals(delprop)) { deldocs.add(id); } else { Object v; if ((v = infos.get(id)) == null) { infos.put(id, new Integer(i)); } else { final String lmod = doc.get(LuceneManager.LASTMODIFIED); final String lmod_prev = searcher.doc(((Integer) v).intValue()).get("_lastmodified"); if (lmod_prev == null || (lmod != null && lmod.compareTo(lmod_prev) > 0)) { infos.put(id, new Integer(i)); } } } } ArrayList listOfMaps = new ArrayList(); for (int i = 0; i < numDocs; i++) { Document doc = searcher.doc(i); String delprop = doc.get(DeletedInfos.DELETED); String layerStr = doc.get(LuceneManager.LAYER_OF_SAVE); int layer = -1; try { layer = Integer.parseInt(layerStr); } catch (Exception ex) { layer = -1; } final String id = doc.get(LuceneManager.ID) + DeletedInfos.KEY_SEPERATOR + doc.get(LuceneManager.LAYER_OF_SAVE); if (delprop != null && "true".equals(delprop)) { continue; } else if (id != null && deldocs.contains(id)) { continue; } Integer idx = (Integer) infos.get(id); if (idx != null && i != idx.intValue()) { continue; } Document ndoc = convertDocument(doc); if (ndoc != null) { writer.addDocument(ndoc); } } } catch (Exception ex) { ex.printStackTrace(); throw new RuntimeException(ex); } finally { if (searcher != null) { try { searcher.close(); } catch (Exception ex) { } } if (lmgr != null) { lmgr.shutdown(); lmgr = null; } indexDir.close(); SegmentInfos sinfos = IndexObjectsFactory.getFSSegmentInfos(indexDir); sinfos.clear(); IndexObjectsFactory.removeDeletedInfos(indexDir); } Connection conn = null; boolean exceptionOccured = false; try { if (writer != null) { conn = DriverManager.getConnection(url); conn.setAutoCommit(false); writer.close(); writer.flushCache(); LuceneManager.commitSegments(null, conn, dbhome, writer.getDirectory()); writer.finalizeTrans(); } } catch (Exception ex) { ex.printStackTrace(); exceptionOccured = true; throw new RuntimeException(ex); } finally { if (conn != null) { try { if (!conn.getAutoCommit()) { if (!exceptionOccured) { conn.commit(); } else { conn.rollback(); } } conn.close(); } catch (Exception ex) { ex.printStackTrace(); } conn = null; } nindexDir.close(); SegmentInfos sinfos = IndexObjectsFactory.getFSSegmentInfos(nindexDir); sinfos.clear(); IndexObjectsFactory.removeDeletedInfos(nindexDir); } File[] files = dbhome.listFiles(); for (int i = 0; i < files.length; i++) { if (!files[i].isDirectory()) { files[i].delete(); } } files = ndbhome.listFiles(); for (int i = 0; i < files.length; i++) { if (!files[i].isDirectory()) { File nfile = new File(dbhome, files[i].getName()); files[i].renameTo(nfile); } } if (!FileUtils.deleteDir(ndbhome)) { throw new Exception("Could not delete " + ndbhome); } }
From source file:axiom.objectmodel.dom.convert.LuceneConvertor.java
License:Open Source License
public void convert(Application app, File dbhome) throws Exception { FSDirectory indexDir = FSDirectory.getDirectory(dbhome, false); if (indexDir instanceof TransFSDirectory) { FSDirectory.setDisableLocks(true); TransFSDirectory d = (TransFSDirectory) indexDir; TransSource source = app.getTransSource(); d.setDriverClass(source.getDriverClass()); d.setUrl(source.getUrl());/* w w w . jav a 2 s. c o m*/ d.setUser(source.getUser()); d.setPassword(source.getPassword()); } File ndbhome = new File(dbhome.getParentFile(), dbhome.getName() + "_tmp"); File olddbhome = new File(dbhome.getParentFile(), dbhome.getName() + "_old"); FSDirectory nindexDir = FSDirectory.getDirectory(ndbhome, true); if (nindexDir instanceof TransFSDirectory) { FSDirectory.setDisableLocks(true); TransFSDirectory d = (TransFSDirectory) nindexDir; TransSource source = app.getTransSource(); d.setDriverClass(source.getDriverClass()); d.setUrl(source.getUrl()); d.setUser(source.getUser()); d.setPassword(source.getPassword()); } IndexSearcher searcher = null; IndexWriter writer = null; LuceneManager lmgr = null; try { searcher = new IndexSearcher(indexDir); PerFieldAnalyzerWrapper a = LuceneManager.buildAnalyzer(); writer = IndexWriterManager.getWriter(nindexDir, a, true); final int numDocs = searcher.getIndexReader().numDocs(); HashSet deldocs = new HashSet(); HashMap infos = new HashMap(); for (int i = 0; i < numDocs; i++) { Document doc = searcher.doc(i); String delprop = doc.get(DeletedInfos.DELETED); String layerStr = doc.get(LuceneManager.LAYER_OF_SAVE); int layer = -1; try { layer = Integer.parseInt(layerStr); } catch (Exception ex) { layer = -1; } final String id = doc.get(LuceneManager.ID) + DeletedInfos.KEY_SEPERATOR + doc.get(LuceneManager.LAYER_OF_SAVE); if (delprop != null && "true".equals(delprop)/* && layer == DbKey.LIVE_LAYER*/) { deldocs.add(id); } else { Object v; if ((v = infos.get(id)) == null) { infos.put(id, new Integer(i)); } else { final String lmod = doc.get(LuceneManager.LASTMODIFIED); final String lmod_prev = searcher.doc(((Integer) v).intValue()).get("_lastmodified"); if (lmod_prev == null || (lmod != null && lmod.compareTo(lmod_prev) > 0)) { infos.put(id, new Integer(i)); } } } } ArrayList listOfMaps = new ArrayList(); for (int i = 0; i < numDocs; i++) { Document doc = searcher.doc(i); String delprop = doc.get(DeletedInfos.DELETED); String layerStr = doc.get(LuceneManager.LAYER_OF_SAVE); int layer = -1; try { layer = Integer.parseInt(layerStr); } catch (Exception ex) { layer = -1; } final String id = doc.get(LuceneManager.ID) + DeletedInfos.KEY_SEPERATOR + doc.get(LuceneManager.LAYER_OF_SAVE); if (delprop != null && "true".equals(delprop)) { continue; } else if (id != null && deldocs.contains(id)/* && layer == DbKey.LIVE_LAYER*/) { continue; } Integer idx = (Integer) infos.get(id); if (idx != null && i != idx.intValue()) { continue; } Document ndoc = convertDocument(doc); if (this.recordNodes) { listOfMaps.add(LuceneManager.luceneDocumentToMap(doc)); } if (ndoc != null) { writer.addDocument(ndoc); } } if (this.recordNodes) { lmgr = new LuceneManager(this.app, false, true); this.allNodes = new HashMap(); final int size = listOfMaps.size(); for (int i = 0; i < size; i++) { HashMap m = (HashMap) listOfMaps.get(i); INode n = lmgr.mapToNode(m); this.allNodes.put(n.getID(), getPath(n)); n = null; } } } catch (Exception ex) { ex.printStackTrace(); throw new RuntimeException(ex); } finally { if (searcher != null) { try { searcher.close(); } catch (Exception ex) { app.logError(ErrorReporter.errorMsg(this.getClass(), "convert"), ex); } } if (lmgr != null) { lmgr.shutdown(); lmgr = null; } indexDir.close(); SegmentInfos sinfos = IndexObjectsFactory.getFSSegmentInfos(indexDir); sinfos.clear(); IndexObjectsFactory.removeDeletedInfos(indexDir); } Connection conn = null; boolean exceptionOccured = false; try { if (writer != null) { TransSource ts = app.getTransSource(); conn = ts.getConnection(); DatabaseMetaData dmd = conn.getMetaData(); ResultSet rs = dmd.getColumns(null, null, "Lucene", "version"); if (!rs.next()) { final String alterTbl = "ALTER TABLE Lucene ADD version INT NOT NULL DEFAULT 1"; PreparedStatement pstmt = null; try { pstmt = conn.prepareStatement(alterTbl); pstmt.execute(); } catch (SQLException sqle) { app.logError(ErrorReporter.errorMsg(this.getClass(), "convert"), sqle); } finally { if (pstmt != null) { pstmt.close(); pstmt = null; } } } rs.close(); rs = null; writer.close(); writer.flushCache();//TODO:writer.writeSegmentsFile(); LuceneManager.commitSegments(conn, app, writer.getDirectory()); writer.finalizeTrans(); this.updateSQL(conn); } } catch (Exception ex) { ex.printStackTrace(); exceptionOccured = true; throw new RuntimeException(ex); } finally { if (conn != null) { try { if (!conn.getAutoCommit()) { if (!exceptionOccured) { conn.commit(); } else { conn.rollback(); } } conn.close(); } catch (Exception ex) { app.logError(ErrorReporter.errorMsg(this.getClass(), "convert"), ex); } conn = null; } nindexDir.close(); SegmentInfos sinfos = IndexObjectsFactory.getFSSegmentInfos(nindexDir); sinfos.clear(); IndexObjectsFactory.removeDeletedInfos(nindexDir); } if (!dbhome.renameTo(olddbhome)) { throw new Exception("Could not move the old version of the db into " + olddbhome); } if (!ndbhome.renameTo(dbhome)) { throw new Exception("Could not move the newer version of the db into " + dbhome); } File oldBlobDir = new File(olddbhome, "blob"); File newBlobDir = new File(ndbhome, "blob"); oldBlobDir.renameTo(newBlobDir); if (!FileUtils.deleteDir(olddbhome)) { throw new Exception("Could not delete the old version of the db at " + olddbhome); } }
From source file:com.berico.clavin.index.IndexDirectoryBuilder.java
License:Apache License
/** * Turns a GeoNames gazetteer file into a Lucene index, and adds * some supplementary gazetteer records at the end. * // www. ja v a2 s . c o m * @param args not used * @throws IOException */ public static void main(String[] args) throws IOException { logger.info("Indexing... please wait."); // Create a new index file on disk, allowing Lucene to choose // the best FSDirectory implementation given the environment. // TODO: delete this directory first, if it exists FSDirectory index = FSDirectory.open(new File("./IndexDirectory")); // indexing by lower-casing & tokenizing on whitespace Analyzer indexAnalyzer = new WhitespaceLowerCaseAnalyzer(); // create the object that will actually build the Lucene index IndexWriter indexWriter = new IndexWriter(index, new IndexWriterConfig(Version.LUCENE_40, indexAnalyzer)); // open the gazetteer files to be loaded BufferedReader r = new BufferedReader( new InputStreamReader(new FileInputStream(new File(pathToGazetteer)), "UTF-8")); BufferedReader r2 = new BufferedReader(new InputStreamReader( new FileInputStream(new File("./src/main/resources/SupplementaryGazetteer.txt")), "UTF-8")); String line; // let's see how long this takes... Date start = new Date(); // load GeoNames gazetteer into Lucene index while ((line = r.readLine()) != null) addToIndex(indexWriter, line); // add supplementary gazetteer records to index while ((line = r2.readLine()) != null) addToIndex(indexWriter, line); // that wasn't so long, was it? Date stop = new Date(); logger.info("[DONE]"); logger.info(indexWriter.maxDoc() + " geonames added to index."); logger.info("Merging indices... please wait."); indexWriter.close(); index.close(); r.close(); r2.close(); logger.info("[DONE]"); DateFormat df = new SimpleDateFormat("HH:mm:ss"); long elapsed_MILLIS = stop.getTime() - start.getTime(); logger.info("Process started: " + df.format(start) + ", ended: " + df.format(stop) + "; elapsed time: " + MILLISECONDS.toSeconds(elapsed_MILLIS) + " seconds."); }
From source file:com.bericotech.clavin.index.IndexDirectoryBuilder.java
License:Apache License
public void buildIndex(final File indexDir, final List<File> gazetteerFiles, final File altNamesFile) throws IOException { LOG.info("Indexing... please wait."); indexCount = 0;//w ww . j av a2s .c o m // Create a new index file on disk, allowing Lucene to choose // the best FSDirectory implementation given the environment. FSDirectory index = FSDirectory.open(indexDir); // indexing by lower-casing & tokenizing on whitespace Analyzer indexAnalyzer = new WhitespaceLowerCaseAnalyzer(); // create the object that will actually build the Lucene index indexWriter = new IndexWriter(index, new IndexWriterConfig(Version.LUCENE_4_9, indexAnalyzer)); // let's see how long this takes... Date start = new Date(); // if we were given an alternate names file, process it if (altNamesFile != null) { loadAlternateNames(altNamesFile); } // load GeoNames gazetteer into Lucene index String line; int count = 0; for (File gazetteer : gazetteerFiles) { LOG.info("Processing Gazetteer: {}", gazetteer.getAbsolutePath()); BufferedReader reader = new BufferedReader( new InputStreamReader(new FileInputStream(gazetteer), "UTF-8")); while ((line = reader.readLine()) != null) { try { count += 1; // print progress update to console if (count % 100000 == 0) { LOG.info("rowcount: " + count); } GeoName geoName = BasicGeoName.parseFromGeoNamesRecord(line); resolveAncestry(geoName); } catch (IOException e) { LOG.info("Skipping... Error on line: {}", line); } catch (RuntimeException re) { LOG.info("Skipping... Error on line: {}", line); } } reader.close(); } // that wasn't so long, was it? Date stop = new Date(); LOG.info("Unresolved GeoNames (Pre-resolution)"); logUnresolved(); resolveUnresolved(); LOG.info("Unresolved GeoNames (Post-resolution)"); logUnresolved(); LOG.info("Indexing unresolved GeoNames."); for (Set<GeoName> geos : unresolvedMap.values()) { for (GeoName nm : geos) { indexGeoName(nm); } } LOG.info("[DONE]"); LOG.info("{} geonames added to index. ({} records)", indexWriter.maxDoc(), indexCount); LOG.info("Merging indices... please wait."); indexWriter.close(); index.close(); LOG.info("[DONE]"); DateFormat df = new SimpleDateFormat("HH:mm:ss"); long elapsed_MILLIS = stop.getTime() - start.getTime(); LOG.info("Process started: " + df.format(start) + ", ended: " + df.format(stop) + "; elapsed time: " + MILLISECONDS.toSeconds(elapsed_MILLIS) + " seconds."); }
From source file:com.github.tteofili.looseen.Test20NewsgroupsClassification.java
License:Apache License
@Test public void test20Newsgroups() throws Exception { String indexProperty = System.getProperty("index"); if (indexProperty != null) { try {// www . jav a2s . c om index = Boolean.valueOf(indexProperty); } catch (Exception e) { // ignore } } String splitProperty = System.getProperty("split"); if (splitProperty != null) { try { split = Boolean.valueOf(splitProperty); } catch (Exception e) { // ignore } } Path mainIndexPath = Paths.get(INDEX + "/original"); Directory directory = FSDirectory.open(mainIndexPath); Path trainPath = Paths.get(INDEX + "/train"); Path testPath = Paths.get(INDEX + "/test"); Path cvPath = Paths.get(INDEX + "/cv"); FSDirectory cv = null; FSDirectory test = null; FSDirectory train = null; IndexReader testReader = null; if (split) { cv = FSDirectory.open(cvPath); test = FSDirectory.open(testPath); train = FSDirectory.open(trainPath); } if (index) { delete(mainIndexPath); if (split) { delete(trainPath, testPath, cvPath); } } IndexReader reader = null; List<Classifier<BytesRef>> classifiers = new LinkedList<>(); try { Analyzer analyzer = new StandardAnalyzer(); if (index) { System.out.format("Indexing 20 Newsgroups...%n"); long startIndex = System.currentTimeMillis(); IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(analyzer)); buildIndex(new File(PREFIX + "/20n/20_newsgroups"), indexWriter); long endIndex = System.currentTimeMillis(); System.out.format("Indexed %d pages in %ds %n", indexWriter.maxDoc(), (endIndex - startIndex) / 1000); indexWriter.close(); } if (split && !index) { reader = DirectoryReader.open(train); } else { reader = DirectoryReader.open(directory); } if (index && split) { // split the index System.out.format("Splitting the index...%n"); long startSplit = System.currentTimeMillis(); DatasetSplitter datasetSplitter = new DatasetSplitter(0.1, 0); datasetSplitter.split(reader, train, test, cv, analyzer, false, CATEGORY_FIELD, BODY_FIELD, SUBJECT_FIELD, CATEGORY_FIELD); reader.close(); reader = DirectoryReader.open(train); // using the train index from now on long endSplit = System.currentTimeMillis(); System.out.format("Splitting done in %ds %n", (endSplit - startSplit) / 1000); } final long startTime = System.currentTimeMillis(); classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 1, 0, 0, CATEGORY_FIELD, BODY_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, null, analyzer, null, 1, 0, 0, CATEGORY_FIELD, BODY_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, 0, 0, CATEGORY_FIELD, BODY_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, new AxiomaticF1EXP(), analyzer, null, 3, 0, 0, CATEGORY_FIELD, BODY_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, new AxiomaticF1LOG(), analyzer, null, 3, 0, 0, CATEGORY_FIELD, BODY_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, new LMDirichletSimilarity(), analyzer, null, 3, 1, 1, CATEGORY_FIELD, BODY_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, new LMJelinekMercerSimilarity(0.3f), analyzer, null, 3, 1, 1, CATEGORY_FIELD, BODY_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, null, analyzer, null, 3, 1, 1, CATEGORY_FIELD, BODY_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, new DFRSimilarity(new BasicModelG(), new AfterEffectB(), new NormalizationH1()), analyzer, null, 3, 1, 1, CATEGORY_FIELD, BODY_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, new DFRSimilarity(new BasicModelP(), new AfterEffectL(), new NormalizationH3()), analyzer, null, 3, 1, 1, CATEGORY_FIELD, BODY_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, new IBSimilarity(new DistributionSPL(), new LambdaDF(), new Normalization.NoNormalization()), analyzer, null, 3, 1, 1, CATEGORY_FIELD, BODY_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, new IBSimilarity(new DistributionLL(), new LambdaTTF(), new NormalizationH1()), analyzer, null, 3, 1, 1, CATEGORY_FIELD, BODY_FIELD)); classifiers.add(new MinHashClassifier(reader, BODY_FIELD, CATEGORY_FIELD, 15, 1, 100)); classifiers.add(new MinHashClassifier(reader, BODY_FIELD, CATEGORY_FIELD, 30, 3, 300)); classifiers.add(new MinHashClassifier(reader, BODY_FIELD, CATEGORY_FIELD, 10, 1, 100)); classifiers.add(new KNearestFuzzyClassifier(reader, new LMJelinekMercerSimilarity(0.3f), analyzer, null, 1, CATEGORY_FIELD, BODY_FIELD)); classifiers.add(new KNearestFuzzyClassifier(reader, new IBSimilarity(new DistributionLL(), new LambdaTTF(), new NormalizationH1()), analyzer, null, 1, CATEGORY_FIELD, BODY_FIELD)); classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 1, CATEGORY_FIELD, BODY_FIELD)); classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, CATEGORY_FIELD, BODY_FIELD)); classifiers .add(new KNearestFuzzyClassifier(reader, null, analyzer, null, 1, CATEGORY_FIELD, BODY_FIELD)); classifiers .add(new KNearestFuzzyClassifier(reader, null, analyzer, null, 3, CATEGORY_FIELD, BODY_FIELD)); classifiers.add(new KNearestFuzzyClassifier(reader, new AxiomaticF1EXP(), analyzer, null, 3, CATEGORY_FIELD, BODY_FIELD)); classifiers.add(new KNearestFuzzyClassifier(reader, new AxiomaticF1LOG(), analyzer, null, 3, CATEGORY_FIELD, BODY_FIELD)); classifiers.add(new BM25NBClassifier(reader, analyzer, null, CATEGORY_FIELD, BODY_FIELD)); classifiers.add(new CachingNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, BODY_FIELD)); classifiers.add(new SimpleNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, BODY_FIELD)); int maxdoc; if (split) { testReader = DirectoryReader.open(test); maxdoc = testReader.maxDoc(); } else { maxdoc = reader.maxDoc(); } System.out.format("Starting evaluation on %d docs...%n", maxdoc); ExecutorService service = Executors.newCachedThreadPool(); List<Future<String>> futures = new LinkedList<>(); for (Classifier<BytesRef> classifier : classifiers) { testClassifier(reader, startTime, testReader, service, futures, classifier); } for (Future<String> f : futures) { System.out.println(f.get()); } Thread.sleep(10000); service.shutdown(); } finally { if (reader != null) { reader.close(); } directory.close(); if (test != null) { test.close(); } if (train != null) { train.close(); } if (cv != null) { cv.close(); } if (testReader != null) { testReader.close(); } for (Classifier c : classifiers) { if (c instanceof Closeable) { ((Closeable) c).close(); } } } }
From source file:com.github.tteofili.looseen.TestWikipediaClassification.java
License:Apache License
@Test public void testItalianWikipedia() throws Exception { String indexProperty = System.getProperty("index"); if (indexProperty != null) { try {//from w w w . j a va 2 s . co m index = Boolean.valueOf(indexProperty); } catch (Exception e) { // ignore } } String splitProperty = System.getProperty("split"); if (splitProperty != null) { try { split = Boolean.valueOf(splitProperty); } catch (Exception e) { // ignore } } Path mainIndexPath = Paths.get(INDEX + "/original"); Directory directory = FSDirectory.open(mainIndexPath); Path trainPath = Paths.get(INDEX + "/train"); Path testPath = Paths.get(INDEX + "/test"); Path cvPath = Paths.get(INDEX + "/cv"); FSDirectory cv = null; FSDirectory test = null; FSDirectory train = null; DirectoryReader testReader = null; if (split) { cv = FSDirectory.open(cvPath); test = FSDirectory.open(testPath); train = FSDirectory.open(trainPath); } if (index) { delete(mainIndexPath); if (split) { delete(trainPath, testPath, cvPath); } } IndexReader reader = null; try { Collection<String> stopWordsList = Arrays.asList("di", "a", "da", "in", "per", "tra", "fra", "il", "lo", "la", "i", "gli", "le"); CharArraySet stopWords = new CharArraySet(stopWordsList, true); Analyzer analyzer = new ItalianAnalyzer(stopWords); if (index) { System.out.format("Indexing Italian Wikipedia...%n"); long startIndex = System.currentTimeMillis(); IndexWriter indexWriter = new IndexWriter(directory, new IndexWriterConfig(analyzer)); importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current1.xml"), indexWriter); importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current2.xml"), indexWriter); importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current3.xml"), indexWriter); importWikipedia(new File(PREFIX + "/itwiki/itwiki-20150405-pages-meta-current4.xml"), indexWriter); long endIndex = System.currentTimeMillis(); System.out.format("Indexed %d pages in %ds %n", indexWriter.maxDoc(), (endIndex - startIndex) / 1000); indexWriter.close(); } if (split && !index) { reader = DirectoryReader.open(train); } else { reader = DirectoryReader.open(directory); } if (index && split) { // split the index System.out.format("Splitting the index...%n"); long startSplit = System.currentTimeMillis(); DatasetSplitter datasetSplitter = new DatasetSplitter(0.1, 0); for (LeafReaderContext context : reader.leaves()) { datasetSplitter.split(context.reader(), train, test, cv, analyzer, false, CATEGORY_FIELD, TEXT_FIELD, CATEGORY_FIELD); } reader.close(); reader = DirectoryReader.open(train); // using the train index from now on long endSplit = System.currentTimeMillis(); System.out.format("Splitting done in %ds %n", (endSplit - startSplit) / 1000); } final long startTime = System.currentTimeMillis(); List<Classifier<BytesRef>> classifiers = new LinkedList<>(); classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 1, 0, 0, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, new BM25Similarity(), analyzer, null, 1, 0, 0, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, null, analyzer, null, 1, 0, 0, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, new LMDirichletSimilarity(), analyzer, null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, new LMJelinekMercerSimilarity(0.3f), analyzer, null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, 0, 0, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, new DFRSimilarity(new BasicModelG(), new AfterEffectB(), new NormalizationH1()), analyzer, null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, new DFRSimilarity(new BasicModelP(), new AfterEffectL(), new NormalizationH3()), analyzer, null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, new IBSimilarity(new DistributionSPL(), new LambdaDF(), new Normalization.NoNormalization()), analyzer, null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new KNearestNeighborClassifier(reader, new IBSimilarity(new DistributionLL(), new LambdaTTF(), new NormalizationH1()), analyzer, null, 3, 1, 1, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 5, 1, 100)); classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 10, 1, 100)); classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 1, 100)); classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 3, 100)); classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 15, 3, 300)); classifiers.add(new MinHashClassifier(reader, TEXT_FIELD, CATEGORY_FIELD, 5, 3, 100)); classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 3, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new KNearestFuzzyClassifier(reader, new ClassicSimilarity(), analyzer, null, 1, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new KNearestFuzzyClassifier(reader, new BM25Similarity(), analyzer, null, 3, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new KNearestFuzzyClassifier(reader, new BM25Similarity(), analyzer, null, 1, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new BM25NBClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new CachingNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD)); classifiers.add(new SimpleNaiveBayesClassifier(reader, analyzer, null, CATEGORY_FIELD, TEXT_FIELD)); int maxdoc; if (split) { testReader = DirectoryReader.open(test); maxdoc = testReader.maxDoc(); } else { maxdoc = reader.maxDoc(); } System.out.format("Starting evaluation on %d docs...%n", maxdoc); ExecutorService service = Executors.newCachedThreadPool(); List<Future<String>> futures = new LinkedList<>(); for (Classifier<BytesRef> classifier : classifiers) { final IndexReader finalReader = reader; final DirectoryReader finalTestReader = testReader; futures.add(service.submit(() -> { ConfusionMatrixGenerator.ConfusionMatrix confusionMatrix; if (split) { confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(finalTestReader, classifier, CATEGORY_FIELD, TEXT_FIELD, 60000 * 30); } else { confusionMatrix = ConfusionMatrixGenerator.getConfusionMatrix(finalReader, classifier, CATEGORY_FIELD, TEXT_FIELD, 60000 * 30); } final long endTime = System.currentTimeMillis(); final int elapse = (int) (endTime - startTime) / 1000; return " * " + classifier + " \n * accuracy = " + confusionMatrix.getAccuracy() + "\n * precision = " + confusionMatrix.getPrecision() + "\n * recall = " + confusionMatrix.getRecall() + "\n * f1-measure = " + confusionMatrix.getF1Measure() + "\n * avgClassificationTime = " + confusionMatrix.getAvgClassificationTime() + "\n * time = " + elapse + " (sec)\n "; })); } for (Future<String> f : futures) { System.out.println(f.get()); } Thread.sleep(10000); service.shutdown(); } finally { try { if (reader != null) { reader.close(); } if (directory != null) { directory.close(); } if (test != null) { test.close(); } if (train != null) { train.close(); } if (cv != null) { cv.close(); } if (testReader != null) { testReader.close(); } } catch (Throwable e) { e.printStackTrace(); } } }
From source file:com.globalsight.ling.tm2.lucene.LuceneIndexWriter.java
License:Apache License
/** * Removes all index for a Tm. This method removes all index files * and directories that belong to the specified Tm. This method * tries to remove Tm in a destructive fashion, meaning that, even * if the locks on the indexes have been obtained by other * process/thread, this method goes ahead and delete indexes. The * other process/thread with locks will get exceptions. * * @param p_tmId Tm id//from w ww. j a v a2s .co m */ static public void removeTm(long p_tmId) throws Exception { File tmIndexRoot = LuceneUtil.getGoldTmIndexParentDir(p_tmId); String[] localeNames = tmIndexRoot.list(); // localeNames can be null if no index has been created. if (localeNames != null) { for (int i = 0; i < localeNames.length; i++) { File directory = new File(tmIndexRoot, localeNames[i]); try { if (!directory.isDirectory()) { FileUtil.deleteFile(directory); continue; } // delete all index files and Lucene lock files FSDirectory fsDirectory = FSDirectory.open(directory); // delete our lock file Lock lock = fsDirectory.makeLock(LOCK_NAME); lock.release(); fsDirectory.close(); // delete index directory FileUtil.deleteFile(directory); // clean cache if have LuceneCache.cleanLuceneCache(directory); } catch (Exception e) { c_logger.error("Exception is thrown while removing a Tm index. Ignoring...", e); } } } // delete index root directory tmIndexRoot.delete(); }
From source file:com.leavesfly.lia.tool.BooksMoreLikeThis.java
License:Apache License
public static void main(String[] args) throws Throwable { String indexDir = System.getProperty("index.dir"); FSDirectory directory = FSDirectory.open(new File(indexDir)); IndexReader reader = IndexReader.open(directory); IndexSearcher searcher = new IndexSearcher(reader); int numDocs = reader.maxDoc(); MoreLikeThis mlt = new MoreLikeThis(reader); // #A mlt.setFieldNames(new String[] { "title", "author" }); mlt.setMinTermFreq(1); // #B mlt.setMinDocFreq(1);//from ww w. j ava 2 s.com for (int docID = 0; docID < numDocs; docID++) { // #C System.out.println(); Document doc = reader.document(docID); System.out.println(doc.get("title")); Query query = mlt.like(docID); // #D System.out.println(" query=" + query); TopDocs similarDocs = searcher.search(query, 10); if (similarDocs.totalHits == 0) System.out.println(" None like this"); for (int i = 0; i < similarDocs.scoreDocs.length; i++) { if (similarDocs.scoreDocs[i].doc != docID) { // #E doc = reader.document(similarDocs.scoreDocs[i].doc); System.out.println(" -> " + doc.getField("title").stringValue()); } } } searcher.close(); reader.close(); directory.close(); }
From source file:com.mathworks.xzheng.tools.BooksMoreLikeThis.java
License:Apache License
public static void main(String[] args) throws Throwable { String indexDir = System.getProperty("index.dir"); FSDirectory directory = FSDirectory.open(new File(indexDir)); IndexReader reader = IndexReader.open(directory); IndexSearcher searcher = new IndexSearcher(reader); int numDocs = reader.maxDoc(); MoreLikeThis mlt = new MoreLikeThis(reader); // #A mlt.setFieldNames(new String[] { "title", "author" }); mlt.setMinTermFreq(1); // #B mlt.setMinDocFreq(1);/*from www . j a v a 2 s . com*/ for (int docID = 0; docID < numDocs; docID++) { // #C System.out.println(); Document doc = reader.document(docID); System.out.println(doc.get("title")); Query query = mlt.like(docID); // #D System.out.println(" query=" + query); TopDocs similarDocs = searcher.search(query, 10); if (similarDocs.totalHits == 0) System.out.println(" None like this"); for (int i = 0; i < similarDocs.scoreDocs.length; i++) { if (similarDocs.scoreDocs[i].doc != docID) { // #E doc = reader.document(similarDocs.scoreDocs[i].doc); System.out.println(" -> " + doc.getField("title").stringValue()); } } } reader.close(); directory.close(); }