Usage examples for the org.apache.lucene.analysis.CharArraySet constructor
public CharArraySet(Collection<?> c, boolean ignoreCase)
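Before the harvested examples, a minimal sketch of this constructor in isolation (class and variable names are illustrative, not taken from the sources below): the collection's contents are copied into the set, and ignoreCase=true makes add and contains case-insensitive.

import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;

public class CharArraySetDemo {
    public static void main(String[] args) {
        // Copy a Collection<String> into the set; true => ignore case
        CharArraySet stopWords = new CharArraySet(Arrays.asList("the", "and", "of"), true);
        System.out.println(stopWords.contains("The")); // true: lookups fold case
        System.out.println(stopWords.contains("xyz")); // false
    }
}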
From source file:org.karsha.tokenize.DefaultTokenizer.java
License:Open Source License
public DefaultTokenizer() {
    this.lu_stop_words = new CharArraySet(10, false);
    this.te_stop_words = new CharArraySet(10, false);
    for (int i = 0; i < LUCENE_STOP_WORDS.length; i++) {
        lu_stop_words.add(LUCENE_STOP_WORDS[i]);
    }
    for (int i = 0; i < TERRIER_STOP_WORDS.length; i++) {
        te_stop_words.add(TERRIER_STOP_WORDS[i]);
    }
}
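Since this page documents the collection-based constructor, the two loops above can be collapsed into one-step construction. A hedged equivalent, assuming LUCENE_STOP_WORDS and TERRIER_STOP_WORDS are String[] (the example does not show their declarations):

// One-step equivalents of the loops above; false => case-sensitive lookups
this.lu_stop_words = new CharArraySet(Arrays.asList(LUCENE_STOP_WORDS), false);
this.te_stop_words = new CharArraySet(Arrays.asList(TERRIER_STOP_WORDS), false);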
From source file:org.lambda3.indra.pp.StandardPreProcessorIterator.java
License:Open Source License
private TokenStream getStopFilter(String lang, Set<String> metadataStopWords, TokenStream stream) {
    if (metadataStopWords != null && !metadataStopWords.isEmpty()) {
        return new StopFilter(stream, new CharArraySet(metadataStopWords, false));
    } else {
        try {
            Set<String> sws = getDefaultStopWordSet(lang);
            if (sws != null) {
                CharArraySet stopWords = new CharArraySet(30, true);
                stopWords.addAll(sws);
                return new StopFilter(stream, stopWords);
            }
        } catch (IndraException e) {
            throw new IndraRuntimeException(String.format("Error creating stop filter for lang '%s'", lang), e);
        }
    }
    return stream;
}
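Note the mixed case handling here: metadata stop words are wrapped case-sensitively (ignoreCase=false), while the per-language defaults go into a case-insensitive set (ignoreCase=true). A small illustrative sketch of the difference:

import java.util.Arrays;
import org.apache.lucene.analysis.CharArraySet;

public class IgnoreCaseDemo {
    public static void main(String[] args) {
        CharArraySet exact = new CharArraySet(Arrays.asList("The"), false);
        CharArraySet folded = new CharArraySet(Arrays.asList("The"), true);
        System.out.println(exact.contains("the"));  // false: entries must match exactly
        System.out.println(folded.contains("the")); // true: folded on add and lookup
    }
}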
From source file:org.tallison.lucene.contrast.QueryToCorpusContraster.java
License:Apache License
public List<TermIDF> contrast(Query query, String fieldName, int numResults) throws IOException {
    TopScoreDocCollector results = TopScoreDocCollector.create(maxDocs, maxDocs + 10000);
    searcher.search(query, results);
    ScoreDoc[] scoreDocs = results.topDocs().scoreDocs;
    // if there are fewer documents than minTermFreq, return an empty list now
    if (scoreDocs.length < minTermFreq) {
        return new ArrayList<TermIDF>();
    }
    // total hack
    int initialSize = scoreDocs.length * 100;
    CharArrayMap<MutableValueInt> map = new CharArrayMap<MutableValueInt>(initialSize, ignoreCase);
    CharArraySet tmpSet = new CharArraySet(100, ignoreCase);
    Set<String> selector = new HashSet<String>();
    selector.add(fieldName);
    for (ScoreDoc scoreDoc : scoreDocs) {
        // get terms from doc
        processDoc(scoreDoc.doc, fieldName, selector, tmpSet);
        // now update global doc freqs
        Iterator<Object> it = tmpSet.iterator();
        while (it.hasNext()) {
            char[] token = (char[]) it.next();
            MutableValueInt docCount = map.get(token, 0, token.length);
            if (docCount == null) {
                docCount = new MutableValueInt();
                docCount.value = 1;
            } else {
                docCount.value++;
            }
            map.put(token, docCount);
        }
        tmpSet.clear();
    }
    return getResults(fieldName, map, numResults);
}
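The tmpSet above is reused as a per-document deduplication buffer: processDoc adds each distinct term once, the iterator hands back the raw char[] entries, and clear() empties the set for the next document. The same pattern in isolation (illustrative names):

import org.apache.lucene.analysis.CharArraySet;

public class ScratchSetDemo {
    public static void main(String[] args) {
        CharArraySet tmpSet = new CharArraySet(100, false);
        tmpSet.add("alpha");
        tmpSet.add("alpha");              // duplicates collapse: one entry survives
        tmpSet.add("beta");
        for (Object o : tmpSet) {         // iteration yields raw char[] entries
            char[] token = (char[]) o;
            System.out.println(new String(token));
        }
        tmpSet.clear();                   // ready for the next document
    }
}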
From source file:reviews.indexing.IndexReviews.java
License:Apache License
private static CharArraySet readStopWords(String filename) {
    // Seed with Lucene's English defaults, case-insensitive
    CharArraySet stopwords = new CharArraySet(StopAnalyzer.ENGLISH_STOP_WORDS_SET, true);
    // try-with-resources closes the reader; FileNotFoundException is an IOException
    try (BufferedReader br = new BufferedReader(new FileReader(new File(filename)))) {
        String line;
        while ((line = br.readLine()) != null) {
            stopwords.add(line.trim());
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
    return stopwords;
}
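Lucene also ships a helper for this one-word-per-line pattern. A sketch of the equivalent using WordlistLoader.getWordSet (the class's package location varies across Lucene versions, so treat this as an assumption to verify against yours):

import java.io.IOException;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.WordlistLoader;

static CharArraySet readStopWordsWithLoader(String filename) throws IOException {
    // getWordSet reads one entry per line into a CharArraySet
    try (Reader reader = Files.newBufferedReader(Paths.get(filename), StandardCharsets.UTF_8)) {
        return WordlistLoader.getWordSet(reader);
    }
}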
From source file:ri.AnalyzerNuevo.java
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer source = new StandardTokenizer();
    // Note: the Analyzer framework supplies the Reader after createComponents
    // returns; setting one here would violate the TokenStream contract.

    //SynonymMap.Builder builder = new SynonymMap.Builder(true);
    //builder.add(new CharsRef("text"), new CharsRef("documento"), true);
    //SynonymMap synonymMap;

    TokenStream pipeline = source;
    pipeline = new StandardFilter(pipeline);
    pipeline = new EnglishPossessiveFilter(pipeline);
    /*try {
        synonymMap = builder.build();
        pipeline = new SynonymFilter(pipeline, synonymMap, true);
    } catch (IOException ex) {
        Logger.getLogger(AnalyzerNuevo.class.getName()).log(Level.SEVERE, null, ex);
    }*/
    pipeline = new ASCIIFoldingFilter(pipeline);
    pipeline = new LowerCaseFilter(pipeline);
    pipeline = new StopFilter(pipeline, new CharArraySet(stopwords, true));
    pipeline = new PorterStemFilter(pipeline);
    return new TokenStreamComponents(source, pipeline);
}
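A short usage sketch for an analyzer like this one; the consumption loop (reset, incrementToken, end) is the standard TokenStream pattern, and the field name and input text are made up for illustration:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AnalyzerNuevoDemo {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new AnalyzerNuevo();
        try (TokenStream ts = analyzer.tokenStream("contents", "The documents were indexed")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                   // mandatory before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(term); // lowercased, stop-filtered, stemmed terms
            }
            ts.end();
        }
    }
}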
From source file:tarefa1.IndexFiles.java
License:Apache License
/** Index all text files under a directory. */
public static void main(String[] args) {
    String usage = "java org.apache.lucene.demo.IndexFiles"
            + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n"
            + "This indexes the documents in DOCS_PATH, creating a Lucene index "
            + "in INDEX_PATH that can be searched with SearchFiles";
    String indexPath = "index";
    //String docsPath = null; -- the path comes from main
    boolean create = true;
    for (int i = 0; i < args.length; i++) {
        if ("-index".equals(args[i])) {
            indexPath = args[i + 1];
            i++;
        } else if ("-docs".equals(args[i])) {
            docsPath = args[i + 1];
            i++;
        } else if ("-update".equals(args[i])) {
            create = false;
        }
    }

    if (docsPath == null) {
        System.err.println("Usage: " + usage);
        System.exit(1);
    }

    final Path docDir = Paths.get(docsPath);
    if (!Files.isReadable(docDir)) {
        System.out.println("Document directory '" + docDir.toAbsolutePath()
                + "' does not exist or is not readable, please check the path");
        System.exit(1);
    }

    Date start = new Date();
    try {
        System.out.println("Indexing to directory '" + indexPath + "'...");
        Directory dir = FSDirectory.open(Paths.get(indexPath));
        //Analyzer analyzer = new StandardAnalyzer();
        Analyzer analyzer;
        if (stoplist) {
            if (stemming) {
                analyzer = new EnglishAnalyzer();
            } else {
                analyzer = new StandardAnalyzer();
            }
        } else {
            String[] stopWordsArray = {};
            CharArraySet stopWords = new CharArraySet(2, true);
            stopWords.addAll(Arrays.asList(stopWordsArray));
            if (stemming) {
                analyzer = new EnglishAnalyzer(stopWords);
            } else {
                analyzer = new StandardAnalyzer(stopWords);
            }
        }
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

        if (create) {
            // Create a new index in the directory, removing any
            // previously indexed documents:
            iwc.setOpenMode(OpenMode.CREATE);
        } else {
            // Add new documents to an existing index:
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        }

        // Optional: for better indexing performance, if you
        // are indexing many documents, increase the RAM
        // buffer. But if you do this, increase the max heap
        // size to the JVM (eg add -Xmx512m or -Xmx1g):
        //
        // iwc.setRAMBufferSizeMB(256.0);

        IndexWriter writer = new IndexWriter(dir, iwc);
        indexDocs(writer, docDir);

        // NOTE: if you want to maximize search performance,
        // you can optionally call forceMerge here. This can be
        // a terribly costly operation, so generally it's only
        // worth it when your index is relatively static (ie
        // you're done adding documents to it):
        //
        // writer.forceMerge(1);

        writer.close();

        Date end = new Date();
        System.out.println(end.getTime() - start.getTime() + " total milliseconds");
    } catch (IOException e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}
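When the stop list is disabled, this example builds an empty CharArraySet by hand from an empty array. A shorter equivalent (assuming the same stoplist and stemming flags as above) uses the constant Lucene provides:

// CharArraySet.EMPTY_SET is equivalent to the hand-built empty set above
Analyzer analyzer = stemming ? new EnglishAnalyzer(CharArraySet.EMPTY_SET)
        : new StandardAnalyzer(CharArraySet.EMPTY_SET);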
From source file:tarefa1.SearchFiles.java
License:Apache License
/** Simple command-line based search demo. */
public static void main(String[] args) throws Exception {
    String usage = "Usage:\tjava org.apache.lucene.demo.SearchFiles"
            + " [-index dir] [-field f] [-repeat n] [-queries file] [-query string] [-raw] [-paging hitsPerPage]\n\n"
            + "See http://lucene.apache.org/core/4_1_0/demo/ for details.";
    if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) {
        System.out.println(usage);
        System.exit(0);
    }

    String index = "index";
    String field = "contents";
    String queries = null;
    int repeat = 0;
    boolean raw = false;
    String queryString = null;
    int hitsPerPage = 10;

    for (int i = 0; i < args.length; i++) {
        if ("-index".equals(args[i])) {
            index = args[i + 1];
            i++;
        } else if ("-field".equals(args[i])) {
            field = args[i + 1];
            i++;
        } else if ("-queries".equals(args[i])) {
            queries = args[i + 1];
            i++;
        } else if ("-query".equals(args[i])) {
            queryString = args[i + 1];
            i++;
        } else if ("-repeat".equals(args[i])) {
            repeat = Integer.parseInt(args[i + 1]);
            i++;
        } else if ("-raw".equals(args[i])) {
            raw = true;
        } else if ("-paging".equals(args[i])) {
            hitsPerPage = Integer.parseInt(args[i + 1]);
            if (hitsPerPage <= 0) {
                System.err.println("There must be at least 1 hit per page.");
                System.exit(1);
            }
            i++;
        }
    }

    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index)));
    IndexSearcher searcher = new IndexSearcher(reader);

    Analyzer analyzer;
    if (stoplist) {
        if (stemming) {
            analyzer = new EnglishAnalyzer();
        } else {
            analyzer = new StandardAnalyzer();
        }
    } else {
        String[] stopWordsArray = {};
        CharArraySet stopWords = new CharArraySet(2, true);
        stopWords.addAll(Arrays.asList(stopWordsArray));
        if (stemming) {
            analyzer = new EnglishAnalyzer(stopWords);
        } else {
            analyzer = new StandardAnalyzer(stopWords);
        }
    }

    BufferedReader in = null;
    if (queries != null) {
        in = Files.newBufferedReader(Paths.get(queries), StandardCharsets.UTF_8);
    } else {
        in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
    }
    QueryParser parser = new QueryParser(field, analyzer);
    while (true) {
        if (queries == null && queryString == null) {
            // prompt the user
            System.out.println("Enter query: ");
        }

        String line = queryString != null ? queryString : in.readLine();
        if (line == null) {
            break;
        }

        line = line.trim();
        if (line.length() == 0) {
            break;
        }

        Query query = parser.parse(line);
        System.out.println("Searching for: " + query.toString(field));

        if (repeat > 0) {
            // repeat & time as benchmark
            Date start = new Date();
            for (int i = 0; i < repeat; i++) {
                searcher.search(query, 100);
            }
            Date end = new Date();
            System.out.println("Time: " + (end.getTime() - start.getTime()) + "ms");
        }

        doPagingSearch(in, searcher, query, hitsPerPage, raw, queries == null && queryString == null);

        if (queryString != null) {
            break;
        }
    }
    reader.close();
}