Example usage for org.apache.lucene.analysis.cn.smart SmartChineseAnalyzer SmartChineseAnalyzer

List of usage examples for org.apache.lucene.analysis.cn.smart SmartChineseAnalyzer SmartChineseAnalyzer

Introduction

On this page you can find example usage for org.apache.lucene.analysis.cn.smart SmartChineseAnalyzer SmartChineseAnalyzer.

Prototype

public SmartChineseAnalyzer() 

Source Link

Document

Create a new SmartChineseAnalyzer, using the default stopword list.

Usage

From source file:com.adanac.module.blog.search.LuceneHelper.java

License:Apache License

/**
 * Builds (or rebuilds) the Lucene index stored under {@code INDEX_PATH + path}.
 * <p>
 * The index is always recreated ({@link OpenMode#CREATE}), wiping any previous
 * contents. Each row of {@code dataList} becomes one document with an {@code id}
 * field and an {@code indexedContent} field (title + SEPARATOR + content).
 *
 * @param path     sub-directory of INDEX_PATH holding this index
 * @param id       map key whose value is the numeric document id
 * @param title    map key whose value is the document title
 * @param content  map key whose value is the document body
 * @param dataList rows to index; each map must contain the three keys above
 */
private static void generateIndex(String path, String id, String title, String content,
        List<Map<String, String>> dataList) {
    // try-with-resources: the original leaked the Directory and IndexWriter
    // whenever an exception was thrown before writer.close().
    try (Directory dir = FSDirectory.open(Paths.get(INDEX_PATH + path))) {
        Analyzer analyzer = new SmartChineseAnalyzer();
        IndexWriterConfig indexWriterConfig = new IndexWriterConfig(analyzer);
        indexWriterConfig.setOpenMode(OpenMode.CREATE); // full rebuild every call
        try (IndexWriter writer = new IndexWriter(dir, indexWriterConfig)) {
            for (Map<String, String> data : dataList) {
                Document document = new Document();
                Field idField = new IntField("id", Integer.valueOf(data.get(id)), Field.Store.YES);
                Field indexedContentField = new TextField("indexedContent",
                        data.get(title) + SEPARATOR + data.get(content), Field.Store.YES);
                document.add(idField);
                document.add(indexedContentField);
                writer.addDocument(document);
                if (logger.isInfoEnabled()) {
                    logger.info("add index for : [" + data.get(title) + "]");
                }
            }
        }
    } catch (Exception e) {
        // best-effort: indexing failures are logged, not propagated (matches original contract)
        logger.error("add index failed ...", e);
    }
}

From source file:com.adanac.module.blog.search.LuceneHelper.java

License:Apache License

/**
 * Full-text search over the index at {@code INDEX_PATH + path}, returning up to
 * 100 hits de-duplicated by id. Matched terms are wrapped in a red span by the
 * highlighter; the fragment before SEPARATOR becomes {@code title}, the part
 * after it (or the whole fragment) becomes {@code summary}.
 *
 * @param searchText user query, parsed against the "indexedContent" field
 * @param path       sub-directory of INDEX_PATH holding the index
 * @param title      map key under which the highlighted title is stored
 * @param loadQuery  callback that loads the full record for a hit id
 * @return matching records, or an empty list on any failure
 */
private static List<Map<String, String>> search(String searchText, String path, String title,
        LoadQuery loadQuery) {
    // try-with-resources: the original never closed the IndexReader (leak).
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(INDEX_PATH + path)))) {
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new SmartChineseAnalyzer();
        QueryParser parser = new QueryParser("indexedContent", analyzer);
        Query query = parser.parse(searchText);
        TopDocs resultDocs = searcher.search(query, 100);
        ScoreDoc[] scoreDocs = resultDocs.scoreDocs;
        SimpleHTMLFormatter simpleHtmlFormatter = new SimpleHTMLFormatter("<span style=\"color:red;\">",
                "</span>");
        Highlighter highlighter = new Highlighter(simpleHtmlFormatter, new QueryScorer(query));
        highlighter.setTextFragmenter(new SimpleFragmenter(150));
        List<Map<String, String>> result = new ArrayList<>();
        List<Integer> idList = new ArrayList<>();
        for (ScoreDoc scoreDoc : scoreDocs) {
            Document doc = searcher.doc(scoreDoc.doc);
            Integer id = Integer.valueOf(doc.get("id"));
            if (idList.contains(id)) {
                continue; // already emitted this record
            }
            String indexedContent = doc.get("indexedContent");
            TokenStream tokenStream = analyzer.tokenStream("indexedContent", indexedContent);
            Map<String, String> data = loadQuery.getById(id);
            String highlighterString = highlighter.getBestFragment(tokenStream, indexedContent);
            // getBestFragment returns null when no fragment scores > 0;
            // the original NPE'd on .contains() in that case.
            if (highlighterString == null) {
                highlighterString = indexedContent;
            }
            if (highlighterString.contains(SEPARATOR)) {
                String[] array = highlighterString.split(SEPARATOR);
                data.put(title, array[0]);
                if (array.length > 1) {
                    data.put("summary", array[1]);
                }
            } else {
                data.put("summary", highlighterString);
            }
            result.add(data);
            idList.add(id);
        }
        return result;
    } catch (Exception e) {
        logger.error("search failed ...", e);
    }
    return new ArrayList<>();
}

From source file:com.lin.studytest.lucene.IndexFiles.java

License:Apache License

/** Index all text files under a directory. */
public static void main(String[] args) {
    //      String usage = "java org.apache.lucene.demo.IndexFiles"
    //            + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n"
    //            + "This indexes the documents in DOCS_PATH, creating a Lucene index"
    //            + "in INDEX_PATH that can be searched with SearchFiles";
    String indexPath = "D:\\software\\lucene\\testdata\\indexpath";
    String docsPath = "D:\\software\\lucene\\testdata\\docpath";
    RAMDirectory ramDirectory = new RAMDirectory();

    boolean create = false;
    //      for(int i=0;i<args.length;i++) {
    //         if ("-index".equals(args[i])) {
    //            indexPath = args[i+1];
    //            i++;
    //         } else if ("-docs".equals(args[i])) {
    //            docsPath = args[i+1];
    //            i++;
    //         } else if ("-update".equals(args[i])) {
    //            create = false;
    //         }/*ww w. java  2s. c  om*/
    //      }

    //      if (docsPath == null) {
    //         System.err.println("Usage: " + usage);
    //         System.exit(1);
    //      }

    final Path docDir = Paths.get(docsPath);
    if (!Files.isReadable(docDir)) {
        System.out.println("Document directory '" + docDir.toAbsolutePath()
                + "' does not exist or is not readable, please check the path");
        System.exit(1);
    }

    Date start = new Date();
    try {
        System.out.println("Indexing to directory '" + indexPath + "'...");

        Directory dir = FSDirectory.open(Paths.get(indexPath));
        Analyzer analyzer = new SmartChineseAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);

        if (create) {
            // Create a new index in the directory, removing any
            // previously indexed documents:
            iwc.setOpenMode(OpenMode.CREATE);
        } else {
            // Add new documents to an existing index:
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        }

        // Optional: for better indexing performance, if you
        // are indexing many documents, increase the RAM
        // buffer.  But if you do this, increase the max heap
        // size to the JVM (eg add -Xmx512m or -Xmx1g):
        //
        // iwc.setRAMBufferSizeMB(256.0);

        IndexWriter writer = new IndexWriter(dir, iwc);
        indexDocs(writer, docDir);

        // NOTE: if you want to maximize search performance,
        // you can optionally call forceMerge here.  This can be
        // a terribly costly operation, so generally it's only
        // worth it when your index is relatively static (ie
        // you're done adding documents to it):
        //
        // writer.forceMerge(1);

        writer.close();

        Date end = new Date();
        System.out.println(end.getTime() - start.getTime() + " total milliseconds");

    } catch (IOException e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}

From source file:net.simpleframework.ado.lucene.AbstractLuceneManager.java

License:Apache License

/**
 * Returns the shared analyzer, creating a {@link SmartChineseAnalyzer}
 * lazily on first use and caching it in {@code defaultAnalyzer}.
 */
protected Analyzer getDefaultAnalyzer() {
    Analyzer cached = defaultAnalyzer;
    if (cached == null) {
        cached = new SmartChineseAnalyzer();
        defaultAnalyzer = cached;
    }
    return cached;
}

From source file:org.apache.nutch.indexwriter.lucene.LuceneWriter.java

License:Apache License

/**
 * Opens the Lucene index writer for this output (Nutch Hadoop indexer).
 * <p>
 * The index is built in a unique local scratch directory and the final
 * location under the job's output path is deleted first; presumably a
 * matching completeLocalOutput call elsewhere in this class copies the
 * finished index to {@code perm} — TODO confirm against the close path.
 *
 * @param job  Hadoop job configuration supplying merge/buffer tuning
 * @param name index directory name under the job's output path
 * @throws IOException on filesystem or index-open failure
 */
public void open(JobConf job, String name) throws IOException {
    this.fs = FileSystem.get(job);
    // Final (permanent) index location in the job output path.
    perm = new Path(FileOutputFormat.getOutputPath(job), name);
    // Random suffix keeps concurrent tasks from colliding on local scratch space.
    temp = job.getLocalPath("index/_" + Integer.toString(new Random().nextInt()));
    fs.delete(perm, true); // delete old, if any
    analyzerFactory = new AnalyzerFactory(job);
    // NOTE(review): the writer always uses SmartChineseAnalyzer here;
    // analyzerFactory is constructed but not consulted in this method.
    IndexWriterConfig indexWriterConfig = new IndexWriterConfig(Version.LUCENE_4_10_2,
            new SmartChineseAnalyzer());
    LogByteSizeMergePolicy mergePolicy = new LogByteSizeMergePolicy();
    // Merge/buffer tuning comes from job config, with defaults matching old Nutch.
    mergePolicy.setMergeFactor(job.getInt("indexer.mergeFactor", 10));
    mergePolicy.setMaxMergeDocs(job.getInt("indexer.maxMergeDocs", Integer.MAX_VALUE));

    indexWriterConfig.setMergePolicy(mergePolicy);
    indexWriterConfig.setUseCompoundFile(false);
    indexWriterConfig.setTermIndexInterval(job.getInt("indexer.termIndexInterval", 128));
    indexWriterConfig.setMaxBufferedDocs(job.getInt("indexer.minMergeDocs", 100));
    indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
    // startLocalOutput returns the local scratch path actually written to.
    writer = new org.apache.lucene.index.IndexWriter(
            FSDirectory.open(new File(fs.startLocalOutput(perm, temp).toString())), indexWriterConfig);

    /*
     * addFieldOptions("title", STORE.YES, INDEX.TOKENIZED, VECTOR.NO, job);
     * addFieldOptions("url", STORE.YES, INDEX.TOKENIZED, VECTOR.NO, job);
     * addFieldOptions("content", STORE.YES, INDEX.TOKENIZED, VECTOR.NO, job);
     * addFieldOptions("lang", STORE.YES, INDEX.UNTOKENIZED, VECTOR.NO, job);
     */

    processOptions(job);
}

From source file:org.elasticsearch.indices.analysis.smartcn.SmartChineseIndicesAnalysis.java

License:Apache License

@Inject
public SmartChineseIndicesAnalysis(Settings settings, IndicesAnalysisService indicesAnalysisService) {
    super(settings);

    // Register smartcn analyzer
    indicesAnalysisService.analyzerProviderFactories().put("smartcn",
            new PreBuiltAnalyzerProviderFactory("smartcn", AnalyzerScope.INDICES, new SmartChineseAnalyzer()));

    // Register smartcn_tokenizer tokenizer
    indicesAnalysisService.tokenizerFactories().put("smartcn_tokenizer",
            new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
                @Override/*from   w w w  .j a  va 2  s .  co  m*/
                public String name() {
                    return "smartcn_tokenizer";
                }

                @Override
                public Tokenizer create() {
                    return new HMMChineseTokenizer();
                }
            }));

    // Register smartcn_sentence tokenizer -- for backwards compat an alias to smartcn_tokenizer
    indicesAnalysisService.tokenizerFactories().put("smartcn_sentence",
            new PreBuiltTokenizerFactoryFactory(new TokenizerFactory() {
                @Override
                public String name() {
                    return "smartcn_sentence";
                }

                @Override
                public Tokenizer create() {
                    return new HMMChineseTokenizer();
                }
            }));

    // Register smartcn_word token filter -- noop
    indicesAnalysisService.tokenFilterFactories().put("smartcn_word",
            new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
                @Override
                public String name() {
                    return "smartcn_word";
                }

                @Override
                public TokenStream create(TokenStream tokenStream) {
                    return tokenStream;
                }
            }));
}

From source file:org.open.crs.service.lucene.IndexFiles.java

License:Apache License

/**
 * Opens (or creates) a Lucene index at {@code indexPath} using the
 * SmartChineseAnalyzer and stores the writer in the {@code writer} field.
 * The {@code create} field decides between a fresh index (CREATE) and
 * appending to an existing one (CREATE_OR_APPEND).
 *
 * @param indexPath filesystem path of the index directory
 * @throws IOException if the directory or writer cannot be opened
 */
public IndexFiles(String indexPath) throws IOException {
    Directory indexDirectory = FSDirectory.open(Paths.get(indexPath));

    Analyzer chineseAnalyzer = new SmartChineseAnalyzer();
    IndexWriterConfig writerConfig = new IndexWriterConfig(chineseAnalyzer);
    // CREATE removes any previously indexed documents; CREATE_OR_APPEND keeps them.
    writerConfig.setOpenMode(create
            ? IndexWriterConfig.OpenMode.CREATE
            : IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

    // Optional: for better indexing performance with many documents,
    // increase the RAM buffer (and the JVM max heap):
    // writerConfig.setRAMBufferSizeMB(256.0);

    writer = new IndexWriter(indexDirectory, writerConfig);
}

From source file:org.open.crs.service.lucene.SearchFiles.java

License:Apache License

/**
 * Opens the index at {@code indexPath} for searching and prepares a
 * QueryParser over the {@code field} field using SmartChineseAnalyzer.
 * Queries are read from the {@code queries} file (UTF-8) when set,
 * otherwise interactively from standard input.
 *
 * @param indexPath filesystem path of the index directory
 * @throws IOException if the index or query source cannot be opened
 */
public SearchFiles(String indexPath) throws IOException {
    indexReader = DirectoryReader.open(FSDirectory.open(Paths.get(indexPath)));
    indexSearcher = new IndexSearcher(indexReader);
    analyzer = new SmartChineseAnalyzer();

    // Query source: file when configured, stdin otherwise (both UTF-8).
    bufferedReader = (queries != null)
            ? Files.newBufferedReader(Paths.get(queries), StandardCharsets.UTF_8)
            : new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));

    queryParser = new QueryParser(field, analyzer);
}

From source file:searchEngine.IndexFiles.java

License:Apache License

/**
 * CLI-style entry point: indexes all documents under -docs DOCS_PATH into
 * -index INDEX_PATH (default "index"). -update appends instead of recreating.
 *
 * @param args [-index INDEX_PATH] [-docs DOCS_PATH] [-update]
 */
public static void Do(String[] args) {
    String usage = "java org.apache.lucene.demo.IndexFiles"
            + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n"
            + "This indexes the documents in DOCS_PATH, creating a Lucene index"
            + "in INDEX_PATH that can be searched with SearchFiles";
    String indexPath = "index";
    String docsPath = null;
    boolean create = true;
    for (int i = 0; i < args.length; i++) {
        if ("-index".equals(args[i])) {
            indexPath = args[i + 1];
            i++;
        } else if ("-docs".equals(args[i])) {
            docsPath = args[i + 1];
            i++;
        } else if ("-update".equals(args[i])) {
            create = false;
        }
    }

    if (docsPath == null) {
        System.err.println("Usage: " + usage);
        System.exit(1);
    }

    final Path docDir = Paths.get(docsPath);
    if (!Files.isReadable(docDir)) {
        System.out.println("Document directory '" + docDir.toAbsolutePath()
                + "' does not exist or is not readable, please check the path");
        System.exit(1);
    }

    Date start = new Date();
    try {
        System.out.println("Indexing to directory '" + indexPath + "'...");

        Analyzer analyzer = new SmartChineseAnalyzer();
        IndexWriterConfig iwc = new IndexWriterConfig(analyzer);
        // CREATE wipes any previous index; CREATE_OR_APPEND adds to it.
        iwc.setOpenMode(create ? OpenMode.CREATE : OpenMode.CREATE_OR_APPEND);

        // try-with-resources: the original leaked the Directory and
        // IndexWriter if indexDocs threw before writer.close().
        try (Directory dir = FSDirectory.open(Paths.get(indexPath));
                IndexWriter writer = new IndexWriter(dir, iwc)) {
            indexDocs(writer, docDir);
        }

        Date end = new Date();
        System.out.println(end.getTime() - start.getTime() + " total milliseconds");

    } catch (IOException e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}

From source file:searchEngine.SearchFiles.java

License:Apache License

/**
 * Runs one search against the index and returns a page of results.
 * <p>
 * Fixes over the original: the IndexReader is now always closed (it was
 * leaked whenever -query was supplied, because the early return skipped
 * reader.close()), and the dead {@code line.length() == -1} check — a
 * String length is never negative — is reduced to the null check.
 *
 * @param args CLI-style flags: -index, -field, -queries, -query, -repeat,
 *             -raw, -paging N, -pageNumber N
 * @return the page of matching URLs, or an empty QueryResult for a blank query
 * @throws Exception on index, parse, or I/O failure
 */
public static QueryResult DoOne(String[] args) throws Exception {
    String index = "index";
    String field = "contents";
    String queries = null;
    int repeat = 0;
    boolean raw = false;
    String queryString = null;
    int hitsPerPage = 10;
    int pageNumber = 1;

    for (int i = 0; i < args.length; i++) {
        if ("-index".equals(args[i])) {
            index = args[i + 1];
            i++;
        } else if ("-field".equals(args[i])) {
            field = args[i + 1];
            i++;
        } else if ("-queries".equals(args[i])) {
            queries = args[i + 1];
            i++;
        } else if ("-query".equals(args[i])) {
            queryString = args[i + 1];
            i++;
        } else if ("-repeat".equals(args[i])) {
            repeat = Integer.parseInt(args[i + 1]);
            i++;
        } else if ("-raw".equals(args[i])) {
            raw = true;
        } else if ("-paging".equals(args[i])) {
            hitsPerPage = Integer.parseInt(args[i + 1]);
            if (hitsPerPage <= 0) {
                System.err.println("There must be at least 1 hit per page.");
                System.exit(1);
            }
            i++;
        } else if ("-pageNumber".equals(args[i])) {
            pageNumber = Integer.parseInt(args[i + 1]);
            if (pageNumber <= 0) {
                System.err.println("There must be at least 1 hit per page.");
                System.exit(1);
            }
            i++;
        }
    }

    IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get(index)));
    try {
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new SmartChineseAnalyzer();

        // Query source: file when -queries given, stdin otherwise (both UTF-8).
        BufferedReader in;
        if (queries != null) {
            in = Files.newBufferedReader(Paths.get(queries), StandardCharsets.UTF_8);
        } else {
            in = new BufferedReader(new InputStreamReader(System.in, StandardCharsets.UTF_8));
        }
        QueryParser parser = new QueryParser(field, analyzer);

        if (queries == null && queryString == null) {
            System.out.println("Enter query: ");
        }

        String line = queryString != null ? queryString : in.readLine();

        if (line == null) {
            return new QueryResult();
        }

        line = line.trim();
        if (line.isEmpty()) {
            return new QueryResult();
        }

        Query query = parser.parse(line);
        System.out.println("Searching for: " + query.toString(field));

        // Optional benchmarking loop (-repeat N).
        if (repeat > 0) {
            Date start = new Date();
            for (int i = 0; i < repeat; i++) {
                searcher.search(query, 100);
            }
            Date end = new Date();
            System.out.println("Time: " + (end.getTime() - start.getTime()) + "ms");
        }

        return doPagingSearch(in, searcher, query, hitsPerPage, pageNumber, raw,
                queries == null && queryString == null);
    } finally {
        // Always release the index — the original skipped this when -query was used.
        reader.close();
    }
}