List of usage examples for org.apache.lucene.util Version.LUCENE_CURRENT.
To view the source code for an org.apache.lucene.util Version.LUCENE_CURRENT example, click its Source Link.
From source file:SimpleCompileTest.java
License:Apache License
@Test public void testStartNode() { Node build = SimpleNodeFactory.node("foobar"); Version.LUCENE_CURRENT.onOrAfter(Version.LUCENE_CURRENT); assert true;//from ww w .ja v a 2 s .c o m }
From source file:aplicacion.sistema.indexer.test.IndexFiles.java
License:Apache License
/**
 * Index all text files under a directory.
 * <p>
 * Demo entry point: indexes everything under the hard-coded source
 * directory {@code E:/indexer} into {@code INDEX_DIR}, then optimizes and
 * closes the index, printing the elapsed wall-clock time.
 */
public static void main(String[] args) {
    String usage = "java org.apache.lucene.demo.IndexFiles <root_directory>";
    // NOTE(review): the original argument check was commented out
    // ("Agustin lo comento" — "Agustin commented this out"), so the usage
    // string above is currently informational only and args are ignored.

    // Refuse to clobber an existing index directory.
    if (INDEX_DIR.exists()) {
        System.out.println("Cannot save index to '" + INDEX_DIR + "' directory, please delete it first");
        System.exit(1);
    }

    final File docDir = new File("E:/indexer");
    if (!docDir.exists() || !docDir.canRead()) {
        System.out.println("Document directory '" + docDir.getAbsolutePath()
                + "' does not exist or is not readable, please check the path");
        System.exit(1);
    }

    Date start = new Date();
    try {
        IndexWriter writer = new IndexWriter(FSDirectory.open(INDEX_DIR),
                new StandardAnalyzer(Version.LUCENE_CURRENT), true, IndexWriter.MaxFieldLength.LIMITED);
        try {
            System.out.println("Indexing to directory '" + INDEX_DIR + "'...");
            indexDocs(writer, docDir);
            System.out.println("Optimizing...");
            writer.optimize();
        } finally {
            // Robustness fix: close the writer even when indexDocs() or
            // optimize() throws, so the index write lock is always released.
            writer.close();
        }

        Date end = new Date();
        System.out.println(end.getTime() - start.getTime() + " total milliseconds");
    } catch (IOException e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}
From source file:aplicacion.sistema.indexer.test.SearchFiles.java
License:Apache License
/** Simple command-line based search demo. */ public static void main(String[] args) throws Exception { String usage = "Usage:\tjava org.apache.lucene.demo.SearchFiles [-index dir] [-field f] [-repeat n] [-queries file] [-raw] [-norms field] [-paging hitsPerPage]"; usage += "\n\tSpecify 'false' for hitsPerPage to use streaming instead of paging search."; if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) { System.out.println(usage); System.exit(0);//from w w w.j a va 2s. c o m } String index = "e:/index"; String field = "contents"; String queries = null; int repeat = 0; boolean raw = false; String normsField = null; boolean paging = true; int hitsPerPage = 10; for (int i = 0; i < args.length; i++) { if ("-index".equals(args[i])) { index = args[i + 1]; i++; } else if ("-field".equals(args[i])) { field = args[i + 1]; i++; } else if ("-queries".equals(args[i])) { queries = args[i + 1]; i++; } else if ("-repeat".equals(args[i])) { repeat = Integer.parseInt(args[i + 1]); i++; } else if ("-raw".equals(args[i])) { raw = true; } else if ("-norms".equals(args[i])) { normsField = args[i + 1]; i++; } else if ("-paging".equals(args[i])) { if (args[i + 1].equals("false")) { paging = false; } else { hitsPerPage = Integer.parseInt(args[i + 1]); if (hitsPerPage == 0) { paging = false; } } i++; } } IndexReader reader = IndexReader.open(FSDirectory.open(new File(index)), true); // only searching, so read-only=true if (normsField != null) reader = new OneNormsReader(reader, normsField); Searcher searcher = new IndexSearcher(reader); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); BufferedReader in = null; if (queries != null) { in = new BufferedReader(new FileReader(queries)); } else { in = new BufferedReader(new InputStreamReader(System.in, "UTF-8")); } QueryParser parser = new QueryParser(Version.LUCENE_CURRENT, field, analyzer); while (true) { if (queries == null) // prompt the user System.out.println("Enter query: "); String line = 
in.readLine(); if (line == null || line.length() == -1) break; line = line.trim(); if (line.length() == 0) break; Query query = parser.parse(line); System.out.println("Searching for: " + query.toString(field)); if (repeat > 0) { // repeat & time as benchmark Date start = new Date(); for (int i = 0; i < repeat; i++) { searcher.search(query, null, 100); } Date end = new Date(); System.out.println("Time: " + (end.getTime() - start.getTime()) + "ms"); } if (paging) { doPagingSearch(in, searcher, query, hitsPerPage, raw, queries == null); } else { doStreamingSearch(searcher, query); } } reader.close(); }
From source file:back.Indexer.java
License:Apache License
/** Index all text files under a directory. */ public static void main(String[] args) { String usage = "java org.apache.lucene.demo.IndexFiles" + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n" + "This indexes the documents in DOCS_PATH, creating a Lucene index" + "in INDEX_PATH that can be searched with SearchFiles"; String indexPath = ".\\indexed"; String docsPath = ".//artigos"; boolean create = true; for (int i = 0; i < args.length; i++) { if ("-index".equals(args[i])) { indexPath = args[i + 1];//from w w w . j ava2s . co m i++; } else if ("-docs".equals(args[i])) { docsPath = args[i + 1]; i++; } else if ("-update".equals(args[i])) { create = false; } } if (docsPath == null) { System.err.println("Usage: " + usage); System.exit(1); } final File docDir = new File(docsPath); if (!docDir.exists() || !docDir.canRead()) { System.out.println("Document directory '" + docDir.getAbsolutePath() + "' does not exist or is not readable, please check the path"); System.exit(1); } Date start = new Date(); try { System.out.println("Indexing to directory '" + indexPath + "'..."); Directory dir = FSDirectory.open(new File(indexPath)); Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT, new CharArraySet(Version.LUCENE_CURRENT, 0, false)); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_40, analyzer); if (create) { // Create a new index in the directory, removing any // previously indexed documents: iwc.setOpenMode(OpenMode.CREATE); } else { // Add new documents to an existing index: iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); } // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // // iwc.setRAMBufferSizeMB(256.0); IndexWriter writer = new IndexWriter(dir, iwc); indexDocs(writer, docDir); // NOTE: if you want to maximize search performance, // you can optionally call forceMerge here. 
This can be // a terribly costly operation, so generally it's only // worth it when your index is relatively static (ie // you're done adding documents to it): // // writer.forceMerge(1); writer.close(); Date end = new Date(); System.out.println(end.getTime() - start.getTime() + " total milliseconds"); } catch (IOException e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } }
From source file:back.Searcher.java
License:Apache License
/** Simple command-line based search demo. */ public static void search(String query, boolean stopword, boolean stemming, int consulta) throws Exception { String index = null;/*from www . j a v a 2s. c o m*/ Analyzer analyzer = null; if (!stopword && !stemming) { index = ".\\indexed"; analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT, new CharArraySet(Version.LUCENE_CURRENT, 0, false)); System.out.println("Nenhum Marcado"); } else if (stopword && !stemming) { index = ".\\indexedNoStpWrd"; analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT); System.out.println("Primeiro Marcado"); } else if (!stopword && stemming) { index = ".\\indexedStemming"; analyzer = new EnglishAnalyzer(Version.LUCENE_CURRENT, new CharArraySet(Version.LUCENE_CURRENT, 0, false)); System.out.println("Segundo Marcado"); } else if (stopword && stemming) { index = ".\\indexedTreated"; analyzer = new EnglishAnalyzer(Version.LUCENE_CURRENT); System.out.println("Dois Marcados"); } String field = "contents"; String queries = null; int repeat = 0; boolean raw = false; String queryString = query; int hitsPerPage = 200; CSVReader CSVreader = new CSVReader(new FileReader(".\\matriz.csv")); List<String[]> myEntries = CSVreader.readAll(); IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(index))); IndexSearcher searcher = new IndexSearcher(reader); BufferedReader in = null; if (queries != null) { in = new BufferedReader(new InputStreamReader(new FileInputStream(queries), "UTF-8")); } else { in = new BufferedReader(new InputStreamReader(System.in, "UTF-8")); } QueryParser parser = new QueryParser(Version.LUCENE_40, field, analyzer); while (true) { if (queries == null && queryString == null) { // prompt the user System.out.println("Enter query: "); } String line = queryString != null ? 
queryString : in.readLine(); if (line == null || line.length() == -1) { break; } line = line.trim(); if (line.length() == 0) { break; } Query query1 = parser.parse(line); System.out.println("Searching for: " + query1.toString(field)); if (repeat > 0) { // repeat & time as benchmark Date start = new Date(); for (int i = 0; i < repeat; i++) { searcher.search(query1, null, 100); } Date end = new Date(); System.out.println("Time: " + (end.getTime() - start.getTime()) + "ms"); } doPagingSearch(in, searcher, query1, hitsPerPage, raw, queries == null && queryString == null, myEntries, consulta); if (queryString != null) { break; } } reader.close(); }
From source file:be.iRail.BeLaws.Indexer.java
License:Apache License
/** Index all text files under a directory. */ private void index() { INDEX_DIR = new File(indexpath); final File docDir = new File(lawspath); if (!docDir.exists() || !docDir.canRead()) { System.out.println("Document directory '" + docDir.getAbsolutePath() + "' does not exist or is not readable, please check the path"); } else {//w w w . j a va2 s.com Date start = new Date(); try { IndexWriter writer = new IndexWriter(FSDirectory.open(INDEX_DIR), new StandardAnalyzer(Version.LUCENE_CURRENT), true, IndexWriter.MaxFieldLength.LIMITED); System.out.println("Indexing to directory '" + INDEX_DIR + "'..."); indexDocs(writer, docDir); System.out.println("Optimizing..."); writer.optimize(); writer.close(); Date end = new Date(); System.out.println(end.getTime() - start.getTime() + " total milliseconds"); } catch (IOException e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } } }
From source file:cn.edu.thss.iise.beehivez.server.util.StringSimilarityUtil.java
License:Open Source License
/**
 * Tokenizes the given label: words are extracted, lowercased, standard
 * English stop words are removed, and each remaining word is replaced by
 * its Snowball stem.
 *
 * @param label the text to tokenize
 * @return the set of distinct stemmed terms (possibly partial if analysis fails)
 */
public static HashSet<String> snowballTokenize(String label) {
    HashSet<String> stems = new HashSet<String>();
    try {
        Analyzer analyzer = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English",
                StandardAnalyzer.STOP_WORDS_SET);
        TokenStream stream = analyzer.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(label));
        TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        stream.reset();
        // The analyzer pipeline handles lowercasing, stop-word removal and
        // stemming; we only collect the distinct surviving terms.
        while (stream.incrementToken()) {
            stems.add(termAtt.term());
        }
        stream.end();
        stream.close();
    } catch (Exception e) {
        // Best effort: report the failure and return what was collected.
        e.printStackTrace();
    }
    return stems;
}
From source file:com.appspot.socialinquirer.server.service.impl.AnalysisServiceImpl.java
License:Apache License
/**
 * Builds a throwaway in-memory index over the given title and body text,
 * pulls the term-frequency vectors for both fields, and returns the merged
 * tags sorted by descending frequency.
 *
 * @param title document title to analyze
 * @param text  document body (HTML is stripped before indexing)
 * @return tags ordered most-frequent first; empty on analysis failure
 */
@Override
public List<Tag> getTermVector(String title, String text) {
    RAMDirectory directory = null;
    IndexReader reader = null;
    Map<String, Tag> tagsMap = new HashMap<String, Tag>();
    try {
        directory = new RAMDirectory();
        IndexWriter writer = new IndexWriter(directory, new StandardAnalyzer(Version.LUCENE_CURRENT), true,
                MaxFieldLength.UNLIMITED);
        Document doc = new Document();
        // Term vectors must be stored so they can be read back below.
        doc.add(new Field("title", title, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES));
        doc.add(new Field("body", stripHtmlTags(text, true), Field.Store.YES, Field.Index.ANALYZED,
                Field.TermVector.YES));
        writer.addDocument(doc);
        // Close before opening the reader so the document is visible.
        writer.close();

        reader = IndexReader.open(directory, true);
        int numDocs = reader.maxDoc();
        for (int i = 0; i < numDocs; i++) {
            TermFreqVector termFreqVector = reader.getTermFreqVector(i, "title");
            pullTags(termFreqVector, tagsMap);
            termFreqVector = reader.getTermFreqVector(i, "body");
            pullTags(termFreqVector, tagsMap);
        }
    } catch (Exception e) {
        logger.log(Level.SEVERE, "An error occured while pulling tags from text.", e);
    } finally {
        closeIndexReader(reader);
        closeRAMDirectory(directory);
    }

    ArrayList<Tag> tagsList = new ArrayList<Tag>(tagsMap.values());
    Collections.sort(tagsList, new Comparator<Tag>() {
        @Override
        public int compare(Tag o1, Tag o2) {
            // Bug fix: the original used subtraction (o2 - o1), which can
            // overflow for large frequency values; Integer.compare gives the
            // same descending order without overflow.
            return Integer.compare(o2.getFreqency(), o1.getFreqency());
        }
    });
    return tagsList;
}
From source file:com.appspot.socialinquirer.server.service.impl.AnalysisServiceImpl.java
License:Apache License
/** * Creates the english analyzer./*from w ww. ja v a 2 s. c om*/ * * @return the analyzer */ private static Analyzer createEnglishAnalyzer() { return new StandardAnalyzer(Version.LUCENE_CURRENT); // return new StandardAnalyzer(Version.LUCENE_CURRENT) { // public TokenStream tokenStream(String fieldName, Reader reader) { // TokenStream result = super.tokenStream(fieldName, reader); // result = new SnowballFilter(result, "English"); // return result; // } // }; }
From source file:com.bigdata.search.DefaultAnalyzerFactory.java
License:Open Source License
/** * Initializes the various kinds of analyzers that we know about. * <p>//from w ww.j a va 2 s .c om * Note: Each {@link Analyzer} is registered under both the 3 letter and the * 2 letter language codes. See <a * href="http://www.loc.gov/standards/iso639-2/php/code_list.php">ISO 639-2</a>. * * @todo get some informed advice on which {@link Analyzer}s map onto which * language codes. * * @todo thread safety? Analyzers produce token processors so maybe there is * no problem here once things are initialized. If so, maybe this * could be static. * * @todo configuration. Could be configured by a file containing a class * name and a list of codes that are handled by that class. * * @todo strip language code down to 2/3 characters during lookup. * * @todo There are a lot of pidgins based on french, english, and other * languages that are not being assigned here. */ synchronized private Map<String, AnalyzerConstructor> getAnalyzers() { if (analyzers != null) { return analyzers; } analyzers = new HashMap<String, AnalyzerConstructor>(); final Set<?> emptyStopwords = Collections.EMPTY_SET; { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new BrazilianAnalyzer(Version.LUCENE_CURRENT) : new BrazilianAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("por", a); analyzers.put("pt", a); } /* * Claims to handle Chinese. Does single character extraction. Claims to * produce smaller indices as a result. * * Note: you can not tokenize with the Chinese analyzer and the do * search using the CJK analyzer and visa versa. * * Note: I have no idea whether this would work for Japanese and Korean * as well. I expect so, but no real clue. 
*/ { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return new ChineseAnalyzer(); } }; analyzers.put("zho", a); analyzers.put("chi", a); analyzers.put("zh", a); } /* * Claims to handle Chinese, Japanese, Korean. Does double character * extraction with overlap. */ { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new CJKAnalyzer(Version.LUCENE_CURRENT) : new CJKAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; // analyzers.put("zho", a); // analyzers.put("chi", a); // analyzers.put("zh", a); analyzers.put("jpn", a); analyzers.put("ja", a); analyzers.put("jpn", a); analyzers.put("kor", a); analyzers.put("ko", a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new CzechAnalyzer(Version.LUCENE_CURRENT) : new CzechAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("ces", a); analyzers.put("cze", a); analyzers.put("cs", a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new DutchAnalyzer(Version.LUCENE_CURRENT) : new DutchAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("dut", a); analyzers.put("nld", a); analyzers.put("nl", a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new FrenchAnalyzer(Version.LUCENE_CURRENT) : new FrenchAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("fra", a); analyzers.put("fre", a); analyzers.put("fr", a); } /* * Note: There are a lot of language codes for German variants that * might be useful here. */ { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? 
new GermanAnalyzer(Version.LUCENE_CURRENT) : new GermanAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("deu", a); analyzers.put("ger", a); analyzers.put("de", a); } // Note: ancient greek has a different code (grc). { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new GreekAnalyzer(Version.LUCENE_CURRENT) : new GreekAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("gre", a); analyzers.put("ell", a); analyzers.put("el", a); } // @todo what about other Cyrillic scripts? { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new RussianAnalyzer(Version.LUCENE_CURRENT) : new RussianAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("rus", a); analyzers.put("ru", a); } { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return new ThaiAnalyzer(Version.LUCENE_CURRENT); } }; analyzers.put("tha", a); analyzers.put("th", a); } // English { AnalyzerConstructor a = new AnalyzerConstructor() { public Analyzer newInstance(final boolean filterStopwords) { return filterStopwords ? new StandardAnalyzer(Version.LUCENE_CURRENT) : new StandardAnalyzer(Version.LUCENE_CURRENT, emptyStopwords); } }; analyzers.put("eng", a); analyzers.put("en", a); /* * Note: There MUST be an entry under the empty string (""). This * entry will be requested when there is no entry for the specified * language code. */ analyzers.put("", a); } return analyzers; }