List of usage examples for the org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer no-argument constructor
public SmartChineseAnalyzer()
From source file:com.adanac.module.blog.search.LuceneHelper.java
License:Apache License
private static void generateIndex(String path, String id, String title, String content, List<Map<String, String>> dataList) { try {// w w w . j a va 2 s .c om Directory dir = FSDirectory.open(Paths.get(INDEX_PATH + path)); Analyzer analyzer = new SmartChineseAnalyzer(); IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer); indexWriterConfig.setOpenMode(OpenMode.CREATE); IndexWriter writer = new IndexWriter(dir, indexWriterConfig); for (Map<String, String> data : dataList) { Document document = new Document(); Field idField = new IntField("id", Integer.valueOf(data.get(id)), Field.Store.YES); Field indexedContentField = new TextField("indexedContent", data.get(title) + SEPARATOR + data.get(content), Field.Store.YES); document.add(idField); document.add(indexedContentField); writer.addDocument(document); if (logger.isInfoEnabled()) { logger.info("add index for : [" + data.get(title) + "]"); } } writer.close(); } catch (Exception e) { logger.error("add index failed ...", e); } }
From source file:com.adanac.module.blog.search.LuceneHelper.java
License:Apache License
private static List<Map<String, String>> search(String searchText, String path, String title, LoadQuery loadQuery) {/*from w w w . j a va 2 s .c o m*/ try { IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(INDEX_PATH + path))); IndexSearcher searcher = new IndexSearcher(reader); Analyzer analyzer = new SmartChineseAnalyzer(); QueryParser parser = new QueryParser("indexedContent", analyzer); Query query = parser.parse(searchText); TopDocs resultDocs = searcher.search(query, 100); ScoreDoc[] scoreDocs = resultDocs.scoreDocs; // SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter("<span style=\"color:red;\">", "</span>"); Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new QueryScorer(query)); highlighter.setTextFragmenter(new SimpleFragmenter(150)); List<Map<String, String>> result = new ArrayList<>(); List<Integer> idList = new ArrayList<>(); for (int i = 0; i < scoreDocs.length; i++) { Document doc = searcher.doc(scoreDocs[i].doc); Integer id = Integer.valueOf(doc.get("id")); if (!idList.contains(id)) { String indexedContent = doc.get("indexedContent"); TokenStream tokenStream = analyzer.tokenStream("indexedContent", indexedContent); Map<String, String> data = loadQuery.getById(id); String highlighterString = highlighter.getBestFragment(tokenStream, indexedContent); if (highlighterString.contains(SEPARATOR)) { String[] array = highlighterString.split(SEPARATOR); data.put(title, array[0]); if (array.length > 1) { data.put("summary", array[1]); } } else { data.put("summary", highlighterString); } result.add(data); idList.add(id); } } return result; } catch (Exception e) { logger.error("search failed ...", e); } return new ArrayList<>(); }
From source file:com.lin.studytest.lucene.IndexFiles.java
License:Apache License
/** Index all text files under a directory. */ public static void main(String[] args) { // String usage = "java org.apache.lucene.demo.IndexFiles" // + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n" // + "This indexes the documents in DOCS_PATH, creating a Lucene index" // + "in INDEX_PATH that can be searched with SearchFiles"; String indexPath = "D:\\software\\lucene\\testdata\\indexpath"; String docsPath = "D:\\software\\lucene\\testdata\\docpath"; RAMDirectory ramDirectory = new RAMDirectory(); boolean create = false; // for(int i=0;i<args.length;i++) { // if ("-index".equals(args[i])) { // indexPath = args[i+1]; // i++; // } else if ("-docs".equals(args[i])) { // docsPath = args[i+1]; // i++; // } else if ("-update".equals(args[i])) { // create = false; // }/*ww w. java 2s. c om*/ // } // if (docsPath == null) { // System.err.println("Usage: " + usage); // System.exit(1); // } final Path docDir = Paths.get(docsPath); if (!Files.isReadable(docDir)) { System.out.println("Document directory '" + docDir.toAbsolutePath() + "' does not exist or is not readable, please check the path"); System.exit(1); } Date start = new Date(); try { System.out.println("Indexing to directory '" + indexPath + "'..."); Directory dir = FSDirectory.open(Paths.get(indexPath)); Analyzer analyzer = new SmartChineseAnalyzer(); IndexWriterConfig iwc = new IndexWriterConfig(analyzer); if (create) { // Create a new index in the directory, removing any // previously indexed documents: iwc.setOpenMode(OpenMode.CREATE); } else { // Add new documents to an existing index: iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); } // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. 
But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // // iwc.setRAMBufferSizeMB(256.0); IndexWriter writer = new IndexWriter(dir, iwc); indexDocs(writer, docDir); // NOTE: if you want to maximize search performance, // you can optionally call forceMerge here. This can be // a terribly costly operation, so generally it's only // worth it when your index is relatively static (ie // you're done adding documents to it): // // writer.forceMerge(1); writer.close(); Date end = new Date(); System.out.println(end.getTime() - start.getTime() + " total milliseconds"); } catch (IOException e) { System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage()); } }
From source file:net.simpleframework.ado.lucene.AbstractLuceneManager.java
License:Apache License
/**
 * Lazily creates and caches the shared SmartChineseAnalyzer.
 *
 * Synchronized to fix the original's unsynchronized lazy initialization:
 * concurrent first calls could create duplicate analyzers or observe an
 * unsafely published instance.
 *
 * @return the cached default analyzer, never null
 */
protected synchronized Analyzer getDefaultAnalyzer() {
    if (defaultAnalyzer == null) {
        defaultAnalyzer = new SmartChineseAnalyzer();
    }
    return defaultAnalyzer;
}
From source file:org.apache.nutch.indexwriter.lucene.LuceneWriter.java
License:Apache License
/**
 * Opens the Lucene IndexWriter for one Nutch indexing task.
 *
 * Writes into a random local temp directory obtained from the job's local
 * dirs; {@code perm} is the final output path (any stale copy is deleted
 * up front). Merge/buffering knobs are read from job configuration.
 *
 * @param job  Hadoop job configuration supplying indexer.* settings
 * @param name task-specific output name under the job output path
 * @throws IOException on filesystem or index-creation failure
 */
public void open(JobConf job, String name) throws IOException {
    this.fs = FileSystem.get(job);
    perm = new Path(FileOutputFormat.getOutputPath(job), name);
    // Random suffix so concurrent tasks on one node don't collide.
    temp = job.getLocalPath("index/_" + Integer.toString(new Random().nextInt()));
    fs.delete(perm, true); // delete old, if any
    analyzerFactory = new AnalyzerFactory(job);
    IndexWriterConfig indexWriterConfig =
            new IndexWriterConfig(Version.LUCENE_4_10_2, new SmartChineseAnalyzer());
    LogByteSizeMergePolicy mergePolicy = new LogByteSizeMergePolicy();
    mergePolicy.setMergeFactor(job.getInt("indexer.mergeFactor", 10));
    mergePolicy.setMaxMergeDocs(job.getInt("indexer.maxMergeDocs", Integer.MAX_VALUE));
    indexWriterConfig.setMergePolicy(mergePolicy);
    indexWriterConfig.setUseCompoundFile(false);
    indexWriterConfig.setTermIndexInterval(job.getInt("indexer.termIndexInterval", 128));
    indexWriterConfig.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100));
    indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    // startLocalOutput gives the local working path; presumably a matching
    // completeLocalOutput in close() promotes it to perm — confirm in caller.
    writer = new org.apache.lucene.index.IndexWriter(
            FSDirectory.open(new File(fs.startLocalOutput(perm, temp).toString())),
            indexWriterConfig);
    /*
     * addFieldOptions("title", STORE.YES, INDEX.TOKENIZED, VECTOR.NO, job);
     * addFieldOptions("url", STORE.YES, INDEX.TOKENIZED, VECTOR.NO, job);
     * addFieldOptions("content", STORE.YES, INDEX.TOKENIZED, VECTOR.NO, job);
     * addFieldOptions("lang", STORE.YES, INDEX.UNTOKENIZED, VECTOR.NO, job);
     */
    processOptions(job);
}
From source file:org.elasticsearch.indices.analysis.smartcn.SmartChineseIndicesAnalysis.java
License:Apache License
/**
 * Registers the smartcn analysis components with Elasticsearch's
 * indices-level analysis service: the "smartcn" analyzer, two tokenizer
 * names ("smartcn_tokenizer" plus legacy alias "smartcn_sentence", both
 * backed by HMMChineseTokenizer), and a no-op "smartcn_word" token filter
 * kept only for backwards compatibility.
 */
@Inject
public SmartChineseIndicesAnalysis(Settings settings, IndicesAnalysisService indicesAnalysisService) {
    super(settings);
    // Register smartcn analyzer
    indicesAnalysisService.analyzerProviderFactories().put("smartcn",
            new PreBuiltAnalyzerProviderFactory("smartcn", AnalyzerScope.INDICES,
                    new SmartChineseAnalyzer()));

    // Register smartcn_tokenizer tokenizer
    indicesAnalysisService.tokenizerFactories().put("smartcn_tokenizer",
            new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
                @Override
                public String name() {
                    return "smartcn_tokenizer";
                }

                @Override
                public Tokenizer create() {
                    return new HMMChineseTokenizer();
                }
            }));

    // Register smartcn_sentence tokenizer -- for backwards compat an alias to smartcn_tokenizer
    indicesAnalysisService.tokenizerFactories().put("smartcn_sentence",
            new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
                @Override
                public String name() {
                    return "smartcn_sentence";
                }

                @Override
                public Tokenizer create() {
                    return new HMMChineseTokenizer();
                }
            }));

    // Register smartcn_word token filter -- noop (passes the stream through)
    indicesAnalysisService.tokenFilterFactories().put("smartcn_word",
            new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
                @Override
                public String name() {
                    return "smartcn_word";
                }

                @Override
                public TokenStream create(TokenStream tokenStream) {
                    return tokenStream;
                }
            }));
}
From source file:org.open.crs.service.lucene.IndexFiles.java
License:Apache License
public IndexFiles(String indexPath) throws IOException { Directory dir = FSDirectory.open(Paths.get(indexPath)); // Analyzer analyzer = new StandardAnalyzer(); // Analyzer analyzer = new CJKAnalyzer(); Analyzer analyzer = new SmartChineseAnalyzer(); IndexWriterConfig iwc = new IndexWriterConfig(analyzer); if (create) { // Create a new index in the directory, removing any // previously indexed documents: iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE); } else {//from w w w.j av a 2 s .co m // Add new documents to an existing index: iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); } // Optional: for better indexing performance, if you // are indexing many documents, increase the RAM // buffer. But if you do this, increase the max heap // size to the JVM (eg add -Xmx512m or -Xmx1g): // // iwc.setRAMBufferSizeMB(256.0); writer = new IndexWriter(dir, iwc); }
From source file:org.open.crs.service.lucene.SearchFiles.java
License:Apache License
/**
 * Opens the index at {@code indexPath} for searching with
 * SmartChineseAnalyzer and prepares the query input source: a UTF-8
 * query file when the {@code queries} field is set, otherwise stdin.
 *
 * @param indexPath filesystem directory holding the index
 * @throws IOException if the index or query file cannot be opened
 */
public SearchFiles(String indexPath) throws IOException {
    indexReader = DirectoryReader.open(FSDirectory.open(Paths.get(indexPath)));
    indexSearcher = new IndexSearcher(indexReader);
    analyzer = new SmartChineseAnalyzer();
    // Queries are always decoded as UTF-8, whether from file or stdin.
    bufferedReader = (queries != null)
            ? Files.newBufferedReader(Paths.get(queries), StandardCharsets.UTF_8)
            : new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
    queryParser = new QueryParser(field, analyzer);
}
From source file:searchEngine.IndexFiles.java
License:Apache License
/**
 * Command-line entry: indexes every document under -docs into the -index
 * directory using SmartChineseAnalyzer. Pass -update to append to an
 * existing index instead of rebuilding it.
 *
 * @param args -index INDEX_PATH, -docs DOCS_PATH (required), -update
 */
public static void Do(String[] args) {
    String usage = "java org.apache.lucene.demo.IndexFiles"
            + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n"
            + "This indexes the documents in DOCS_PATH, creating a Lucene index"
            + "in INDEX_PATH that can be searched with SearchFiles";
    String indexPath = "index";
    String docsPath = null;
    boolean create = true;
    for (int i = 0; i < args.length; i++) {
        if ("-index".equals(args[i])) {
            indexPath = args[i + 1];
            i++;
        } else if ("-docs".equals(args[i])) {
            docsPath = args[i + 1];
            i++;
        } else if ("-update".equals(args[i])) {
            create = false;
        }
    }
    if (docsPath == null) {
        System.err.println("Usage: " + usage);
        System.exit(1);
    }
    final Path docDir = Paths.get(docsPath);
    if (!Files.isReadable(docDir)) {
        System.out.println("Document directory '" + docDir.toAbsolutePath()
                + "' does not exist or is not readable, please check the path");
        System.exit(1);
    }
    Date start = new Date();
    try {
        System.out.println("Indexing to directory '" + indexPath + "'...");
        Directory dir = FSDirectory.open(Paths.get(indexPath));
        Analyzer analyzer = new SmartChineseAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
        iwc.setOpenMode(create ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);
        // try-with-resources: the original leaked the IndexWriter when
        // indexDocs threw before writer.close().
        try (IndexWriter writer = new IndexWriter(dir, iwc)) {
            indexDocs(writer, docDir);
        }
        Date end = new Date();
        System.out.println(end.getTime() - start.getTime() + " total milliseconds");
    } catch (IOException e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}
From source file:searchEngine.SearchFiles.java
License:Apache License
public static QueryResult DoOne(String[] args) throws Exception { String index = "index"; String field = "contents"; String queries = null;//from ww w . j a v a 2 s. c o m int repeat = 0; boolean raw = false; String queryString = null; int hitsPerPage = 10; int pageNumber = 1; for (int i = 0; i < args.length; i++) { if ("-index".equals(args[i])) { index = args[i + 1]; i++; } else if ("-field".equals(args[i])) { field = args[i + 1]; i++; } else if ("-queries".equals(args[i])) { queries = args[i + 1]; i++; } else if ("-query".equals(args[i])) { queryString = args[i + 1]; i++; } else if ("-repeat".equals(args[i])) { repeat = Integer.parseInt(args[i + 1]); i++; } else if ("-raw".equals(args[i])) { raw = true; } else if ("-paging".equals(args[i])) { hitsPerPage = Integer.parseInt(args[i + 1]); if (hitsPerPage <= 0) { System.err.println("There must be at least 1 hit per page."); System.exit(1); } i++; } else if ("-pageNumber".equals(args[i])) { pageNumber = Integer.parseInt(args[i + 1]); if (pageNumber <= 0) { System.err.println("There must be at least 1 hit per page."); System.exit(1); } i++; } } QueryResult urls; IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index))); IndexSearcher searcher = new IndexSearcher(reader); //Analyzer analyzer = new StandardAnalyzer(); Analyzer analyzer = new SmartChineseAnalyzer(); BufferedReader in = null; if (queries != null) { in = Files.newBufferedReader(Paths.get(queries), StandardCharsets.UTF_8); } else { in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8)); } QueryParser parser = new QueryParser(field, analyzer); if (queries == null && queryString == null) { System.out.println("Enter query: "); } String line = queryString != null ? 
queryString : in.readLine(); if (line == null || line.length() == -1) { return new QueryResult(); } line = line.trim(); if (line.length() == 0) { return new QueryResult(); } Query query = parser.parse(line); System.out.println("Searching for: " + query.toString(field)); if (repeat > 0) { Date start = new Date(); for (int i = 0; i < repeat; i++) { searcher.search(query, 100); } Date end = new Date(); System.out.println("Time: " + (end.getTime() - start.getTime()) + "ms"); } urls = doPagingSearch(in, searcher, query, hitsPerPage, pageNumber, raw, queries == null && queryString == null); // // List<String> ursl = urls.getUrls(); // for(int i = 0; i < ursl.size(); i++) // { // ursl.set(i, ursl.get(i).replace("clean", "")); // } if (queryString != null) { return urls; } reader.close(); return urls; }