List of usage examples for the org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer constructor
public SmartChineseAnalyzer(CharArraySet stopWords)
Create a new SmartChineseAnalyzer, using the provided Set of stopwords.
From source file:com.quest.agent.weibomonitor.weiboMonitorAgentImpl.java
License:Open Source License
private ModelRoot collect(long collectionFreqInMs, String groupName, String sqlQuery, int groupID) { Weibo agentRoot = new Weibo(groupName); //TODO: collect data and populate the data collected to model(topology) //List<UrlList> urlList = mWrapper.getUrlList(); Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT); try {//w w w. ja va 2 s . co m SQLProcess sql = new SQLProcess(); ResultSet res = sql.executeQuery(sqlQuery); MyTokenFilter tkFilter = new MyTokenFilter(); while (res.next()) { Reader sentence = new StringReader(res.getString("status").toString()); String weiboID = res.getObject("weiboId").toString(); if (groupID == 0) sql.execute("update status set status.read=1 where weiboId=" + weiboID + ";"); else sql.execute("update status2 set status2.read=1 where weiboId=" + weiboID + ";"); TokenStream ts = ca.tokenStream("", sentence); try { while (ts.incrementToken()) { String ss[] = ts.toString().split(","); ss[0] = ss[0].replace("(", ""); if (tkFilter.doFilter(ss[0])) { if (!map[groupID].containsKey(ss[0])) map[groupID].put(ss[0], new Word(1, ss[0])); else map[groupID].get(ss[0]).plusNum(); } } } catch (IOException e) { mLogger.debug2("error occurred while incrementToken", e); } } } catch (SQLException e) { // TODO Auto-generated catch block e.printStackTrace(); } Word[] wordList = tfidf.doProcess(map[groupID]); int mapsize = map[groupID].size(); for (int i = 0; i < Math.min(mapsize, ItemNumShow); i++) { collectWeibo(wordList[i].getWord(), wordList[i].getNum(), wordList[i].getTfIdf(), wordList[i].getIdf(), agentRoot); } return agentRoot; }
From source file:com.sg.business.vault.index.demo.IndexFiles.java
License:Apache License
/** Index all text files under a directory. */ public static void main(String[] args) { String usage = "java org.apache.lucene.demo.IndexFiles" //$NON-NLS-1$ + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n" //$NON-NLS-1$ + "This indexes the documents in DOCS_PATH, creating a Lucene index" //$NON-NLS-1$ + "in INDEX_PATH that can be searched with SearchFiles"; //$NON-NLS-1$ String indexPath = "index"; //$NON-NLS-1$ String docsPath = null;/* w w w .j a va2s . c o m*/ boolean create = true; for (int i = 0; i < args.length; i++) { if ("-index".equals(args[i])) { //$NON-NLS-1$ indexPath = args[i + 1]; i++; } else if ("-docs".equals(args[i])) { //$NON-NLS-1$ docsPath = args[i + 1]; i++; } else if ("-update".equals(args[i])) { //$NON-NLS-1$ create = false; } } if (docsPath == null) { System.err.println("Usage: " + usage); //$NON-NLS-1$ System.exit(1); } final File docDir = new File(docsPath); if (!docDir.exists() || !docDir.canRead()) { System.out.println("Document directory '" + docDir.getAbsolutePath() //$NON-NLS-1$ + "' does not exist or is not readable, please check the path"); //$NON-NLS-1$ System.exit(1); } Date start = new Date(); try { System.out.println("Indexing to directory '" + indexPath + "'..."); //$NON-NLS-1$ //$NON-NLS-2$ Directory dir = FSDirectory.open(new File(indexPath)); Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_44); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, analyzer); if (create) { // Create a new index in the directory, removing any // previously indexed documents: iwc.setOpenMode(OpenMode.CREATE); } else { // Add new documents to an existing index: iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); } // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. 
But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // // iwc.setRAMBufferSizeMB(256.0); IndexWriter writer = new IndexWriter(dir, iwc); indexDocs(writer, docDir); // NOTE: if you want to maximize search performance, // you can optionally call forceMerge here. This can be // a terribly costly operation, so generally it's only // worth it when your index is relatively static (ie // you're done adding documents to it): // // writer.forceMerge(1); writer.close(); Date end = new Date(); System.out.println(end.getTime() - start.getTime() + " total milliseconds"); //$NON-NLS-1$ } catch (IOException e) { System.out.println(" caught a " + e.getClass() + //$NON-NLS-1$ "\n with message: " + e.getMessage()); //$NON-NLS-1$ } }
From source file:com.sg.business.vault.index.demo.SearchFiles.java
License:Apache License
/** Simple command-line based search demo. */ public static void main(String[] args) throws Exception { String usage = "Usage:\tjava org.apache.lucene.demo.SearchFiles [-index dir] [-field f] [-repeat n] [-queries file] [-query string] [-raw] [-paging hitsPerPage]\n\nSee http://lucene.apache.org/core/4_1_0/demo/ for details."; //$NON-NLS-1$ if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) { //$NON-NLS-1$ //$NON-NLS-2$ System.out.println(usage); System.exit(0);//ww w. java2s . co m } String index = "index"; //$NON-NLS-1$ String field = "contents"; //$NON-NLS-1$ String queries = null; int repeat = 0; boolean raw = false; String queryString = null; int hitsPerPage = 10; for (int i = 0; i < args.length; i++) { if ("-index".equals(args[i])) { //$NON-NLS-1$ index = args[i + 1]; i++; } else if ("-field".equals(args[i])) { //$NON-NLS-1$ field = args[i + 1]; i++; } else if ("-queries".equals(args[i])) { //$NON-NLS-1$ queries = args[i + 1]; i++; } else if ("-query".equals(args[i])) { //$NON-NLS-1$ queryString = args[i + 1]; i++; } else if ("-repeat".equals(args[i])) { //$NON-NLS-1$ repeat = Integer.parseInt(args[i + 1]); i++; } else if ("-raw".equals(args[i])) { //$NON-NLS-1$ raw = true; } else if ("-paging".equals(args[i])) { //$NON-NLS-1$ hitsPerPage = Integer.parseInt(args[i + 1]); if (hitsPerPage <= 0) { System.err.println("There must be at least 1 hit per page."); //$NON-NLS-1$ System.exit(1); } i++; } } IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(index))); IndexSearcher searcher = new IndexSearcher(reader); Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_44); BufferedReader in = null; if (queries != null) { in = new BufferedReader(new InputStreamReader(new FileInputStream(queries), "UTF-8")); //$NON-NLS-1$ } else { in = new BufferedReader(new InputStreamReader(System.in, "UTF-8")); //$NON-NLS-1$ } QueryParser parser = new QueryParser(Version.LUCENE_44, field, analyzer); while (true) { if (queries == null && 
queryString == null) { // prompt the user System.out.println("Enter query: "); //$NON-NLS-1$ } String line = queryString != null ? queryString : in.readLine(); if (line == null || line.length() == -1) { break; } line = line.trim(); if (line.length() == 0) { break; } Query query = parser.parse(line); System.out.println("Searching for: " + query.toString(field)); //$NON-NLS-1$ if (repeat > 0) { // repeat & time as benchmark Date start = new Date(); for (int i = 0; i < repeat; i++) { searcher.search(query, null, 100); } Date end = new Date(); System.out.println("Time: " + (end.getTime() - start.getTime()) //$NON-NLS-1$ + "ms"); //$NON-NLS-1$ } doPagingSearch(in, searcher, query, hitsPerPage, raw, queries == null && queryString == null); if (queryString != null) { break; } } reader.close(); }
From source file:com.sxc.lucene.index.IndexingTest.java
License:Apache License
private IndexWriter getWriter() throws IOException { // 2 IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, new SmartChineseAnalyzer(Version.LUCENE_47)); return new IndexWriter(directory, config); // 2 }
From source file:com.sxc.lucene.searching.BasicSearchingTest.java
License:Apache License
public void testQueryParser() throws Exception { IndexReader reader = DirectoryReader.open(directory); // A IndexSearcher searcher = new IndexSearcher(reader); // B QueryParser parser = new QueryParser(Version.LUCENE_47, "contents", new SmartChineseAnalyzer(Version.LUCENE_47)); Query query = parser.parse("* OR *"); TopDocs docs = searcher.search(query, 10); assertEquals(2, docs.totalHits);/*from w w w. j a v a 2 s . co m*/ Document d = searcher.doc(docs.scoreDocs[0].doc); assertEquals("", d.get("country")); directory.close(); }
From source file:com.sxc.lucene.searching.PhraseQueryTest.java
License:Apache License
protected void setUp() throws IOException { dir = FSDirectory.open(new File("D:/programming/lucene/PhraseQueryTest")); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, new SmartChineseAnalyzer(Version.LUCENE_47)); config.setOpenMode(OpenMode.CREATE); IndexWriter writer = new IndexWriter(dir, config); Document doc = new Document(); doc.add(new TextField("field", // 1 "the quick brown fox jumped over the lazy dog", // 1 Field.Store.YES)); // 1 writer.addDocument(doc);//from www. j a va2 s .c om writer.close(); searcher = new IndexSearcher(DirectoryReader.open(dir)); }
From source file:com.yangxu.searchengine.index.IndexFiles.java
License:Apache License
/** * /*from w w w . jav a 2 s . c o m*/ * @param createOrUpdate * create update */ public void createIndex(boolean createOrUpdate) { if (docsPath == null) { System.err.println("docsPath not exists!"); System.exit(1); } final File docDir = new File(docsPath); if (!docDir.exists() || !docDir.canRead()) { System.out.println("Document directory '" + docDir.getAbsolutePath() + "' does not exist or is not readable, please check the path"); System.exit(1); } Date start = new Date(); try { System.out.println("Indexing to directory '" + indexPath + "'..."); Directory dir = FSDirectory.open(new File(indexPath)); Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_31); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer); if (createOrUpdate) { // Create a new index in the directory, removing any // previously indexed documents: iwc.setOpenMode(OpenMode.CREATE); } else { // Add new documents to an existing index: iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); } // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // // iwc.setRAMBufferSizeMB(256.0); IndexWriter writer = new IndexWriter(dir, iwc); indexDocs(writer, docDir); // NOTE: if you want to maximize search performance, // you can optionally call forceMerge here. This can be // a terribly costly operation, so generally it's only // worth it when your index is relatively static (ie // you're done adding documents to it): // // writer.forceMerge(1); writer.close(); Date end = new Date(); System.out.println(end.getTime() - start.getTime() + " total milliseconds"); } catch (IOException e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } }
From source file:com.yangxu.searchengine.search.SearchFiles.java
License:Apache License
public void makeSearch(String field, String queries, int hitsPerPage) throws Exception { int repeat = 10; boolean raw = false; String queryString = null;/*from w w w . ja v a 2 s . c o m*/ if (hitsPerPage <= 0) { System.err.println("There must be at least 1 hit per page."); System.exit(1); } IndexReader reader = IndexReader.open(FSDirectory.open(new File(indexPath))); IndexSearcher searcher = new IndexSearcher(reader); Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_31); // BufferedReader in = null; if (queries != null) { in = new BufferedReader(new InputStreamReader(new FileInputStream(queries), "UTF-8")); } else { in = new BufferedReader(new InputStreamReader(System.in, "UTF-8")); } QueryParser parser = new QueryParser(Version.LUCENE_31, field, analyzer); while (true) { if (queries == null && queryString == null) { // prompt the user System.out.println("Enter query: "); } String line = queryString != null ? queryString : in.readLine(); if (line == null || line.length() == -1) { break; } line = line.trim(); if (line.length() == 0) { break; } Query query = parser.parse(line); System.out.println("Searching for: " + query.toString(field)); if (repeat > 0) { // repeat & time as benchmark Date start = new Date(); for (int i = 0; i < repeat; i++) { searcher.search(query, null, 100); } Date end = new Date(); System.out.println("Time: " + (end.getTime() - start.getTime()) + "ms"); } doPagingSearch(in, searcher, query, hitsPerPage, raw, queries == null && queryString == null); if (queryString != null) { break; } } searcher.close(); reader.close(); }
From source file:fr.ericlab.sondy.core.access.IndexAccess.java
License:Open Source License
/** * * @param appVariables/*from ww w.j ava 2 s .co m*/ */ public IndexAccess(AppVariables appVariables) { try { Analyzer analyzer; if (appVariables.stemmingLanguage.equalsIgnoreCase("Standard")) { analyzer = new StandardAnalyzer(Version.LUCENE_36); } else { if (appVariables.stemmingLanguage.equals("Chinese")) { analyzer = new SmartChineseAnalyzer(Version.LUCENE_36); } else { String packageName = appVariables.stemmingLanguage.substring(0, 2).toLowerCase(); Class cl = Class.forName("org.apache.lucene.analysis." + packageName + "." + appVariables.stemmingLanguage + "Analyzer"); Class[] types = new Class[] { Version.class, Set.class }; Constructor ct = cl.getConstructor(types); analyzer = (Analyzer) ct.newInstance(Version.LUCENE_36); } } FSDirectory indexDiscret = FSDirectory.open(new File(appVariables.configuration.getWorkspace() + "/datasets/" + appVariables.currentDatasetText.getText() + "/" + appVariables.getCurrentDatasetDiscretization())); IndexWriterConfig configDiscret = new IndexWriterConfig(Version.LUCENE_36, analyzer); writer = new IndexWriter(indexDiscret, configDiscret); reader = IndexReader.open(writer, true); } catch (IOException ex) { Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex); } catch (ClassNotFoundException | NoSuchMethodException | SecurityException | InstantiationException | IllegalAccessException | IllegalArgumentException | InvocationTargetException ex) { Logger.getLogger(IndexAccess.class.getName()).log(Level.SEVERE, null, ex); } }
From source file:fr.ericlab.sondy.core.access.MentionIndexAccess.java
License:Open Source License
/** * * @param appVariables/*from ww w. j ava2s.co m*/ */ public MentionIndexAccess(AppVariables appVariables) { try { Analyzer analyzer; if (appVariables.stemmingLanguage.equalsIgnoreCase("Standard")) { analyzer = new StandardAnalyzer(Version.LUCENE_36); } else { if (appVariables.stemmingLanguage.equals("Chinese")) { analyzer = new SmartChineseAnalyzer(Version.LUCENE_36); } else { String packageName = appVariables.stemmingLanguage.substring(0, 2).toLowerCase(); Class cl = Class.forName("org.apache.lucene.analysis." + packageName + "." + appVariables.stemmingLanguage + "Analyzer"); Class[] types = new Class[] { Version.class, Set.class }; Constructor ct = cl.getConstructor(types); analyzer = (Analyzer) ct.newInstance(Version.LUCENE_36); } } FSDirectory indexDiscret = FSDirectory.open(new File(appVariables.configuration.getWorkspace() + "/datasets/" + appVariables.currentDatasetText.getText() + "/" + appVariables.getCurrentDatasetDiscretization() + "-m")); IndexWriterConfig configDiscret = new IndexWriterConfig(Version.LUCENE_36, analyzer); mentionWriter = new IndexWriter(indexDiscret, configDiscret); mentionReader = IndexReader.open(mentionWriter, true); } catch (IOException ex) { Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex); } catch (ClassNotFoundException | NoSuchMethodException | SecurityException | InstantiationException | IllegalAccessException | IllegalArgumentException | InvocationTargetException ex) { Logger.getLogger(MentionIndexAccess.class.getName()).log(Level.SEVERE, null, ex); } }