List of usage examples for the org.apache.lucene.analysis.hi.HindiAnalyzer constructor
public HindiAnalyzer()
From source file:com.devb.search.IndicIndexer.java
License:Apache License
/**
 * Rebuilds the Hindi index under {@code <webapp root>/hindex/} from the files found directly
 * inside {@code <webapp root>/hdocs/}.
 *
 * <p>Each regular file is read as UTF-8 and split with a {@link StandardTokenizer}. Every token
 * becomes its own document with three stored fields: {@code path} (the file name),
 * {@code linenumber} (a 1-based counter that is incremented once per token, despite its name)
 * and {@code contents} (the token text). Any existing index is overwritten.
 *
 * <p>Failures are reported on standard output / standard error and never propagate to the
 * caller. A failure while processing one file does not stop the remaining files from being
 * indexed, and the index writer and directory are always released.
 */
@Override
public void makeIndex() {
    String indexPath = servletContext.getRealPath("/") + "/hindex/";
    String docsPath = servletContext.getRealPath("/") + "/hdocs/";
    // Kept as a switch for maintainers: true overwrites the index, false appends to it.
    final boolean create = true;

    final File docDir = new File(docsPath);
    if (!docDir.exists() || !docDir.canRead()) {
        System.out.println("Document directory '" + docDir.getAbsolutePath()
                + "' does not exist or is not readable, please check the path\n");
        return;
    }

    long startMillis = System.currentTimeMillis();
    try {
        System.out.println("Indexing to directory '" + indexPath + "'...\n");

        Analyzer analyzer = new HindiAnalyzer();
        // NOTE(review): the match version is passed as null here -- confirm this is intended.
        IndexWriterConfig iwc = new IndexWriterConfig(null, analyzer);
        iwc.setOpenMode(create ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);

        // try-with-resources guarantees the writer (and its write lock) and the directory
        // are released even when indexing fails half-way through.
        try (org.apache.lucene.store.Directory dir = FSDirectory.open(new File(indexPath));
                IndexWriter writer = new IndexWriter(dir, iwc)) {
            // list() yields null when docDir is not a directory or cannot be listed.
            String[] files = docDir.list();
            if (files != null) {
                for (String fileName : files) {
                    File file = new File(docDir, fileName);
                    if (!file.isFile()) {
                        // Sub-directories cannot be opened as a stream; skip them.
                        continue;
                    }
                    try (FileInputStream fileInputStream = new FileInputStream(file);
                            BufferedReader reader = new BufferedReader(
                                    new InputStreamReader(fileInputStream, "UTF-8"));
                            Tokenizer tokenizer = new StandardTokenizer(reader)) {
                        CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
                        tokenizer.reset();
                        int tokenNumber = 0;
                        while (tokenizer.incrementToken()) {
                            Document doc = new Document();
                            doc.add(new StringField("path", file.getName(), Field.Store.YES));
                            doc.add(new TextField("linenumber", Integer.toString(++tokenNumber),
                                    Field.Store.YES));
                            doc.add(new TextField("contents", termAtt.toString(), Field.Store.YES));
                            writer.addDocument(doc);
                        }
                        // TokenStream contract: end() after the last incrementToken().
                        tokenizer.end();
                        System.out.println("Adding " + file + "\n");
                    } catch (Exception e) {
                        // Best effort per file: report and carry on with the next one.
                        e.printStackTrace();
                    }
                }
            }
        }

        long elapsedMillis = System.currentTimeMillis() - startMillis;
        System.out.println(elapsedMillis + " total milliseconds\n");
    } catch (IOException e) {
        System.out.println("Caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}
From source file:com.devb.search.IndicSearcher.java
License:Apache License
/**
 * Runs the query held in {@code this.id} against the Hindi index under
 * {@code <webapp root>/hindex/} and publishes at most ten hits.
 *
 * <p>For every hit the stored {@code path}, {@code linenumber} and {@code contents} fields are
 * copied either into a {@code JSONObject} appended to {@code ja} (when {@code j} is true) or
 * into a {@code SearchResult} stored in {@code contentProvider} under the hit's position.
 *
 * <p>The method returns without searching when the documents directory is missing or
 * unreadable, or when no query is set. Errors are printed and never propagate to the caller;
 * the index reader and directory are always closed.
 *
 * @param j {@code true} to emit JSON results, {@code false} to emit {@code SearchResult}s
 */
private void callSearch(boolean j) {
    System.out.println("Servlet Ctx " + servletContext.getRealPath("/"));
    String indexPath = servletContext.getRealPath("/") + "/hindex/";
    String docsPath = servletContext.getRealPath("/") + "/hdocs/";

    final File docDir = new File(docsPath);
    if (!docDir.exists() || !docDir.canRead()) {
        System.out.println("Document directory '" + docDir.getAbsolutePath()
                + "' does not exist or is not readable, " + "please check the path\n");
        return;
    }

    // Nothing to search for: bail out before any index resource is opened.
    String line = this.id;
    if (line == null) {
        return;
    }

    final String field = "contents";
    final int maxHits = 10;

    // try-with-resources closes the reader and directory on every path, including failures.
    try (org.apache.lucene.store.Directory dir = FSDirectory.open(new File(indexPath));
            IndexReader reader = DirectoryReader.open(dir)) {
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new HindiAnalyzer();
        QueryParser parser = new QueryParser(field, analyzer);

        try {
            System.out.println("Hindi StandardSearcher / callSearch Line " + line);
            Query query = parser.parse(line);
            System.out.println("Hindi StandardSearcher / callSearch Hindi Query " + query);

            ScoreDoc[] hits = searcher.search(query, null, maxHits).scoreDocs;
            for (int i = 0; i < hits.length; i++) {
                Document hitDoc = searcher.doc(hits[i].doc);
                if (j) {
                    JSONObject jo = new JSONObject();
                    jo.put("query", query.toString(field));
                    jo.put("path", hitDoc.get("path"));
                    jo.put("line", hitDoc.get("linenumber"));
                    jo.put("contents", hitDoc.get("contents"));
                    ja.put(jo);
                } else {
                    SearchResult ns = new SearchResult();
                    ns.setQuery(query.toString(field));
                    ns.setDocPath(hitDoc.get("path"));
                    ns.setLineNum(hitDoc.get("linenumber"));
                    ns.setContents(hitDoc.get("contents"));
                    contentProvider.put(String.valueOf(i), ns);
                }
            }
        } catch (Exception ex) {
            // Parse, search and result-building failures are reported but not rethrown.
            ex.printStackTrace();
        }
    } catch (IOException ioe) {
        // The index could not be opened or closed; there is nothing to search.
        ioe.printStackTrace();
    }
}
From source file:com.work.IndexFiles.java
License:Apache License
/** Index all text files under a directory. */ public static void main(String[] args) { // English//from ww w . j ava 2s . co m // String indexPath = "C:/Users/Harish/Desktop/IR/Data/Data English/masc_500k_texts/written/letters/index/one/"; // String docsPath = "C:/Users/Harish/Desktop/IR/Data/Data English/masc_500k_texts/written/letters/files/"; // Analyzer analyzer = new StandardAnalyzer(); //Hindi String indexPath = "C:/Users/Harish/Desktop/IR/Data/Hindi Data/hin_corp_unicode/index/one/"; String docsPath = "C:/Users/Harish/Desktop/IR/Data/Hindi Data/hin_corp_unicode/sample/"; Analyzer analyzer = new HindiAnalyzer(); //Chinese // Analyzer analyzer = new CJKAnalyzer(); boolean create = true; final Path docDir = Paths.get(docsPath); if (!Files.isReadable(docDir)) { System.out.println("Document directory '" + docDir.toAbsolutePath() + "' does not exist or is not readable, please check the path"); System.exit(1); } Date start = new Date(); try { System.out.println("Indexing to directory '" + indexPath + "'..."); Directory dir = FSDirectory.open(Paths.get(indexPath)); IndexWriterConfig iwc = new IndexWriterConfig(analyzer); if (create) { // Create a new index in the directory, removing any // previously indexed documents: iwc.setOpenMode(OpenMode.CREATE); } else { // Add new documents to an existing index: iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); } // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // // iwc.setRAMBufferSizeMB(256.0); IndexWriter writer = new IndexWriter(dir, iwc); indexDocs(writer, docDir); // NOTE: if you want to maximize search performance, // you can optionally call forceMerge here. 
This can be // a terribly costly operation, so generally it's only // worth it when your index is relatively static (ie // you're done adding documents to it): // // writer.forceMerge(1); writer.close(); Date end = new Date(); System.out.println(end.getTime() - start.getTime() + " total milliseconds"); } catch (IOException e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } }
From source file:com.work.SearchFiles.java
License:Apache License
/** Simple command-line based search demo. */ public static void main(String[] args) throws Exception { // String indexPath = "C:/Users/Harish/Desktop/IR/Data/Data English/masc_500k_texts/written/letters/index/one/"; // String queries = "C:/Users/Harish/Desktop/IR/Data/Data English/masc_500k_texts/written/letters/query/query1.txt"; // Analyzer analyzer = new StandardAnalyzer(); //Hindi//from w w w . ja va 2s . c o m String indexPath = "C:/Users/Harish/Desktop/IR/Data/Hindi Data/hin_corp_unicode/index/one/"; // String queries = "C:/Users/Harish/Desktop/IR/Data/Hindi Data/hin_corp_unicode/query/one.txt"; String queries = null; Analyzer analyzer = new HindiAnalyzer(); //Chinese // Analyzer analyzer = new CJKAnalyzer(); String index = indexPath; String field = "contents"; String a = "???"; int repeat = 0; boolean raw = false; String queryString = null; int hitsPerPage = 10; IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index))); IndexSearcher searcher = new IndexSearcher(reader); BufferedReader in = null; if (queries != null) { in = Files.newBufferedReader(Paths.get(queries), StandardCharsets.UTF_8); } else { in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8)); } QueryParser parser = new QueryParser(field, analyzer); while (true) { if (queries == null && queryString == null) { // prompt the user System.out.println("Enter query: "); } String line = queryString != null ? 
queryString : in.readLine(); if (line == null || line.length() == -1) { break; } line = line.trim(); if (line.length() == 0) { break; } Query query = parser.parse(line); System.out.println("Searching for: " + query.toString(field)); if (repeat > 0) { // repeat & time as benchmark Date start = new Date(); for (int i = 0; i < repeat; i++) { searcher.search(query, 100); } Date end = new Date(); System.out.println("Time: " + (end.getTime() - start.getTime()) + "ms"); } doPagingSearch(in, searcher, query, hitsPerPage, raw, queries == null && queryString == null); if (queryString != null) { break; } } reader.close(); }
From source file:de.mirkosertic.desktopsearch.AnalyzerCache.java
License:Open Source License
public AnalyzerCache(Configuration aConfiguration) { standardAnalyzer = configure(new StandardAnalyzer()); analyzerByLanguage = new HashMap<>(); registerIfEnabled(SupportedLanguage.ar, aConfiguration, configure(new ArabicAnalyzer())); registerIfEnabled(SupportedLanguage.bg, aConfiguration, configure(new BulgarianAnalyzer())); registerIfEnabled(SupportedLanguage.br, aConfiguration, configure(new BrazilianAnalyzer())); registerIfEnabled(SupportedLanguage.ca, aConfiguration, configure(new CatalanAnalyzer())); registerIfEnabled(SupportedLanguage.ckb, aConfiguration, configure(new SoraniAnalyzer())); registerIfEnabled(SupportedLanguage.cz, aConfiguration, configure(new CzechAnalyzer())); registerIfEnabled(SupportedLanguage.da, aConfiguration, configure(new DanishAnalyzer())); registerIfEnabled(SupportedLanguage.de, aConfiguration, configure(new GermanAnalyzer())); registerIfEnabled(SupportedLanguage.el, aConfiguration, configure(new GreekAnalyzer())); registerIfEnabled(SupportedLanguage.en, aConfiguration, configure(new EnglishAnalyzer())); registerIfEnabled(SupportedLanguage.es, aConfiguration, configure(new SpanishAnalyzer())); registerIfEnabled(SupportedLanguage.eu, aConfiguration, configure(new BasqueAnalyzer())); registerIfEnabled(SupportedLanguage.fa, aConfiguration, configure(new PersianAnalyzer())); registerIfEnabled(SupportedLanguage.fi, aConfiguration, configure(new FinnishAnalyzer())); registerIfEnabled(SupportedLanguage.fr, aConfiguration, configure(new FrenchAnalyzer())); registerIfEnabled(SupportedLanguage.ga, aConfiguration, configure(new IrishAnalyzer())); registerIfEnabled(SupportedLanguage.gl, aConfiguration, configure(new GalicianAnalyzer())); registerIfEnabled(SupportedLanguage.hi, aConfiguration, configure(new HindiAnalyzer())); registerIfEnabled(SupportedLanguage.hu, aConfiguration, configure(new HungarianAnalyzer())); registerIfEnabled(SupportedLanguage.hy, aConfiguration, configure(new ArmenianAnalyzer())); registerIfEnabled(SupportedLanguage.id, 
aConfiguration, configure(new IndonesianAnalyzer())); registerIfEnabled(SupportedLanguage.it, aConfiguration, configure(new ItalianAnalyzer())); registerIfEnabled(SupportedLanguage.lv, aConfiguration, configure(new LatvianAnalyzer())); registerIfEnabled(SupportedLanguage.nl, aConfiguration, configure(new DutchAnalyzer())); registerIfEnabled(SupportedLanguage.no, aConfiguration, configure(new NorwegianAnalyzer())); registerIfEnabled(SupportedLanguage.pt, aConfiguration, configure(new PortugueseAnalyzer())); registerIfEnabled(SupportedLanguage.ro, aConfiguration, configure(new RomanianAnalyzer())); registerIfEnabled(SupportedLanguage.ru, aConfiguration, configure(new RussianAnalyzer())); registerIfEnabled(SupportedLanguage.sv, aConfiguration, configure(new SwedishAnalyzer())); registerIfEnabled(SupportedLanguage.th, aConfiguration, configure(new ThaiAnalyzer())); registerIfEnabled(SupportedLanguage.tr, aConfiguration, configure(new TurkishAnalyzer())); }