Example usage of the org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer constructor

List of usage examples for the org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer constructor

Introduction

On this page you can find example usages of the org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer constructor.

Prototype

public SmartChineseAnalyzer(CharArraySet stopWords) 

Source Link

Document

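Create a new SmartChineseAnalyzer, using the provided Set of stopwords. Note that the examples below do not call this overload: they all construct the analyzer as new SmartChineseAnalyzer(Version), passing a Lucene Version constant.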

Usage

From source file:com.quest.agent.weibomonitor.weiboMonitorAgentImpl.java

License:Open Source License

/**
 * Runs {@code sqlQuery}, tokenizes the {@code status} text of every returned row with a
 * {@link SmartChineseAnalyzer}, accumulates per-word counts in {@code map[groupID]} and
 * reports the leading entries of the TF-IDF result through {@code collectWeibo}.
 *
 * <p>Each processed row is flagged as read in table {@code status} (when {@code groupID}
 * is 0) or {@code status2} (any other group).
 *
 * @param collectionFreqInMs not used by this method
 * @param groupName          name given to the returned {@code Weibo} root
 * @param sqlQuery           query whose rows must expose the columns {@code status} and
 *                           {@code weiboId}
 * @param groupID            index into {@code map}; also selects the table that is updated
 * @return the {@code Weibo} root populated through {@code collectWeibo}
 */
private ModelRoot collect(long collectionFreqInMs, String groupName, String sqlQuery, int groupID) {
    Weibo agentRoot = new Weibo(groupName);
    //TODO: collect data and populate the data collected to model(topology) 

    //List<UrlList> urlList = mWrapper.getUrlList();

    Analyzer ca = new SmartChineseAnalyzer(Version.LUCENE_CURRENT);

    ResultSet res = null;
    try {

        SQLProcess sql = new SQLProcess();

        res = sql.executeQuery(sqlQuery);

        MyTokenFilter tkFilter = new MyTokenFilter();

        while (res.next()) {
            Reader sentence = new StringReader(res.getString("status").toString());

            String weiboID = res.getObject("weiboId").toString();

            // NOTE(review): the id is concatenated into the SQL text. It comes from the
            // database here, but a parameterized statement would be safer if SQLProcess
            // offers one -- confirm.
            if (groupID == 0) {
                sql.execute("update status set status.read=1 where weiboId=" + weiboID + ";");
            } else {
                sql.execute("update status2 set status2.read=1 where weiboId=" + weiboID + ";");
            }

            TokenStream ts = ca.tokenStream("", sentence);
            try {
                while (ts.incrementToken()) {
                    // The term is taken from the text before the first comma of the stream's
                    // toString(), with any "(" removed.
                    // NOTE(review): presumably relies on the attribute toString() layout of
                    // the Lucene version in use -- confirm before upgrading Lucene.
                    String[] ss = ts.toString().split(",");
                    ss[0] = ss[0].replace("(", "");
                    if (tkFilter.doFilter(ss[0])) {
                        if (!map[groupID].containsKey(ss[0])) {
                            map[groupID].put(ss[0], new Word(1, ss[0]));
                        } else {
                            map[groupID].get(ss[0]).plusNum();
                        }
                    }
                }
            } catch (IOException e) {
                mLogger.debug2("error occurred while incrementToken", e);
            } finally {
                // Release the stream even when tokenization failed part-way through.
                try {
                    ts.close();
                } catch (IOException e) {
                    mLogger.debug2("error occurred while closing token stream", e);
                }
            }
        }
    } catch (SQLException e) {
        // Route the failure through the agent logger instead of stderr.
        mLogger.debug2("error occurred while reading weibo statuses", e);
    } finally {
        if (res != null) {
            try {
                res.close();
            } catch (SQLException e) {
                mLogger.debug2("error occurred while closing result set", e);
            }
        }
    }

    Word[] wordList = tfidf.doProcess(map[groupID]);

    // Never read past the end of wordList, even if it is shorter than the map.
    int mapsize = map[groupID].size();
    int limit = Math.min(Math.min(mapsize, ItemNumShow), wordList.length);
    for (int i = 0; i < limit; i++) {

        collectWeibo(wordList[i].getWord(), wordList[i].getNum(), wordList[i].getTfIdf(), wordList[i].getIdf(),
                agentRoot);

    }

    return agentRoot;
}

From source file:com.sg.business.vault.index.demo.IndexFiles.java

License:Apache License

/**
 * Index all text files under a directory.
 *
 * <p>Recognized arguments: {@code -index INDEX_PATH} (defaults to {@code "index"}),
 * {@code -docs DOCS_PATH} (required) and {@code -update} (append to an existing index
 * instead of creating a new one). The usage text is printed and the JVM exits with
 * status 1 when {@code -docs} is missing or when an option is given without its value.
 *
 * @param args command-line arguments as described above
 */
public static void main(String[] args) {
    String usage = "java org.apache.lucene.demo.IndexFiles" //$NON-NLS-1$
            + " [-index INDEX_PATH] [-docs DOCS_PATH] [-update]\n\n" //$NON-NLS-1$
            + "This indexes the documents in DOCS_PATH, creating a Lucene index " //$NON-NLS-1$
            + "in INDEX_PATH that can be searched with SearchFiles"; //$NON-NLS-1$
    String indexPath = "index"; //$NON-NLS-1$
    String docsPath = null;
    boolean create = true;
    for (int i = 0; i < args.length; i++) {
        boolean isIndexOption = "-index".equals(args[i]); //$NON-NLS-1$
        boolean isDocsOption = "-docs".equals(args[i]); //$NON-NLS-1$
        if (isIndexOption || isDocsOption) {
            // An option at the very end of the command line has no value to consume.
            if (i + 1 >= args.length) {
                System.err.println("Usage: " + usage); //$NON-NLS-1$
                System.exit(1);
            }
            i++;
            if (isIndexOption) {
                indexPath = args[i];
            } else {
                docsPath = args[i];
            }
        } else if ("-update".equals(args[i])) { //$NON-NLS-1$
            create = false;
        }
    }

    if (docsPath == null) {
        System.err.println("Usage: " + usage); //$NON-NLS-1$
        System.exit(1);
    }

    final File docDir = new File(docsPath);
    if (!docDir.exists() || !docDir.canRead()) {
        System.out.println("Document directory '" + docDir.getAbsolutePath() //$NON-NLS-1$
                + "' does not exist or is not readable, please check the path"); //$NON-NLS-1$
        System.exit(1);
    }

    Date start = new Date();
    try {
        System.out.println("Indexing to directory '" + indexPath + "'..."); //$NON-NLS-1$ //$NON-NLS-2$

        Directory dir = FSDirectory.open(new File(indexPath));
        Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_44);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, analyzer);

        if (create) {
            // Create a new index in the directory, removing any
            // previously indexed documents:
            iwc.setOpenMode(OpenMode.CREATE);
        } else {
            // Add new documents to an existing index:
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        }

        // Optional: for better indexing performance, if you
        // are indexing many documents, increase the RAM
        // buffer.  But if you do this, increase the max heap
        // size to the JVM (eg add -Xmx512m or -Xmx1g):
        //
        // iwc.setRAMBufferSizeMB(256.0);

        IndexWriter writer = new IndexWriter(dir, iwc);
        try {
            indexDocs(writer, docDir);

            // NOTE: if you want to maximize search performance,
            // you can optionally call forceMerge here.  This can be
            // a terribly costly operation, so generally it's only
            // worth it when your index is relatively static (ie
            // you're done adding documents to it):
            //
            // writer.forceMerge(1);
        } finally {
            // Close the writer even if indexing failed part-way through.
            writer.close();
        }

        Date end = new Date();
        System.out.println(end.getTime() - start.getTime() + " total milliseconds"); //$NON-NLS-1$

    } catch (IOException e) {
        System.out.println(" caught a " + e.getClass() + //$NON-NLS-1$
                "\n with message: " + e.getMessage()); //$NON-NLS-1$
    }
}

From source file:com.sg.business.vault.index.demo.SearchFiles.java

License:Apache License

/**
 * Simple command-line based search demo.
 *
 * <p>Parses the options listed in the usage text, opens the index, then repeatedly reads
 * a query (from {@code -query}, the {@code -queries} file, or standard input), optionally
 * benchmarks it {@code -repeat} times, and hands it to {@code doPagingSearch}. The loop
 * ends on end of input, on a blank line, or after a single {@code -query} string.
 *
 * @param args command-line arguments; every option except {@code -raw} takes a value
 * @throws Exception if the index cannot be opened, an argument or query cannot be parsed,
 *                   or reading input fails
 */
public static void main(String[] args) throws Exception {
    String usage = "Usage:\tjava org.apache.lucene.demo.SearchFiles [-index dir] [-field f] [-repeat n] [-queries file] [-query string] [-raw] [-paging hitsPerPage]\n\nSee http://lucene.apache.org/core/4_1_0/demo/ for details."; //$NON-NLS-1$
    if (args.length > 0 && ("-h".equals(args[0]) || "-help".equals(args[0]))) { //$NON-NLS-1$ //$NON-NLS-2$
        System.out.println(usage);
        System.exit(0);
    }

    String index = "index"; //$NON-NLS-1$
    String field = "contents"; //$NON-NLS-1$
    String queries = null;
    int repeat = 0;
    boolean raw = false;
    String queryString = null;
    int hitsPerPage = 10;

    for (int i = 0; i < args.length; i++) {
        String option = args[i];
        if ("-raw".equals(option)) { //$NON-NLS-1$
            raw = true;
            continue;
        }

        boolean takesValue = "-index".equals(option) //$NON-NLS-1$
                || "-field".equals(option) //$NON-NLS-1$
                || "-queries".equals(option) //$NON-NLS-1$
                || "-query".equals(option) //$NON-NLS-1$
                || "-repeat".equals(option) //$NON-NLS-1$
                || "-paging".equals(option); //$NON-NLS-1$
        if (!takesValue) {
            continue;
        }

        // An option at the very end of the command line has no value to consume.
        if (i + 1 >= args.length) {
            System.err.println(usage);
            System.exit(1);
        }
        String value = args[++i];

        if ("-index".equals(option)) { //$NON-NLS-1$
            index = value;
        } else if ("-field".equals(option)) { //$NON-NLS-1$
            field = value;
        } else if ("-queries".equals(option)) { //$NON-NLS-1$
            queries = value;
        } else if ("-query".equals(option)) { //$NON-NLS-1$
            queryString = value;
        } else if ("-repeat".equals(option)) { //$NON-NLS-1$
            repeat = Integer.parseInt(value);
        } else { // -paging
            hitsPerPage = Integer.parseInt(value);
            if (hitsPerPage <= 0) {
                System.err.println("There must be at least 1 hit per page."); //$NON-NLS-1$
                System.exit(1);
            }
        }
    }

    IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(index)));
    try {
        IndexSearcher searcher = new IndexSearcher(reader);
        Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_44);

        BufferedReader in = null;
        if (queries != null) {
            in = new BufferedReader(new InputStreamReader(new FileInputStream(queries), "UTF-8")); //$NON-NLS-1$
        } else {
            in = new BufferedReader(new InputStreamReader(System.in, "UTF-8")); //$NON-NLS-1$
        }
        try {
            QueryParser parser = new QueryParser(Version.LUCENE_44, field, analyzer);
            while (true) {
                if (queries == null && queryString == null) { // prompt the user
                    System.out.println("Enter query: "); //$NON-NLS-1$
                }

                String line = queryString != null ? queryString : in.readLine();

                // readLine() returns null at end of input.
                if (line == null) {
                    break;
                }

                line = line.trim();
                if (line.length() == 0) {
                    break;
                }

                Query query = parser.parse(line);
                System.out.println("Searching for: " + query.toString(field)); //$NON-NLS-1$

                if (repeat > 0) { // repeat & time as benchmark
                    Date start = new Date();
                    for (int i = 0; i < repeat; i++) {
                        searcher.search(query, null, 100);
                    }
                    Date end = new Date();
                    System.out.println("Time: " + (end.getTime() - start.getTime()) //$NON-NLS-1$
                            + "ms"); //$NON-NLS-1$
                }

                doPagingSearch(in, searcher, query, hitsPerPage, raw, queries == null && queryString == null);

                if (queryString != null) {
                    break;
                }
            }
        } finally {
            // Only the file-backed reader is owned here; System.in is left open.
            if (queries != null) {
                in.close();
            }
        }
    } finally {
        // Release the index reader even when parsing or searching throws.
        reader.close();
    }
}

From source file:com.sxc.lucene.index.IndexingTest.java

License:Apache License

/**
 * Builds an {@link IndexWriter} over {@code directory} that analyzes text with a
 * {@link SmartChineseAnalyzer}, both configured for {@code Version.LUCENE_47}.
 *
 * @return a newly opened writer; the caller is responsible for closing it
 * @throws IOException if the writer cannot be opened
 */
private IndexWriter getWriter() throws IOException {
    final SmartChineseAnalyzer chineseAnalyzer = new SmartChineseAnalyzer(Version.LUCENE_47);
    final IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_47, chineseAnalyzer);
    return new IndexWriter(directory, writerConfig);
}

From source file:com.sxc.lucene.searching.BasicSearchingTest.java

License:Apache License

/**
 * Parses a query against the {@code contents} field with a {@link SmartChineseAnalyzer}
 * and checks the hit count and the {@code country} value of the top hit.
 *
 * @throws Exception if the index cannot be read or the query cannot be parsed
 */
public void testQueryParser() throws Exception {
    IndexReader reader = DirectoryReader.open(directory);
    try {
        IndexSearcher searcher = new IndexSearcher(reader);

        QueryParser parser = new QueryParser(Version.LUCENE_47, "contents",
                new SmartChineseAnalyzer(Version.LUCENE_47));

        // NOTE(review): the query text and the expected country look like they lost
        // non-ASCII characters somewhere upstream -- confirm the intended values.
        Query query = parser.parse("* OR *");
        TopDocs docs = searcher.search(query, 10);
        assertEquals(2, docs.totalHits);
        Document d = searcher.doc(docs.scoreDocs[0].doc);
        assertEquals("", d.get("country"));
    } finally {
        // Close the reader before the directory it reads from.
        reader.close();
    }

    directory.close();
}

From source file:com.sxc.lucene.searching.PhraseQueryTest.java

License:Apache License

/**
 * Recreates the on-disk test index with a single document whose {@code field} holds one
 * English sentence, then opens {@code searcher} on it.
 *
 * @throws IOException if the index cannot be written or reopened
 */
protected void setUp() throws IOException {
    final File indexLocation = new File("D:/programming/lucene/PhraseQueryTest");
    dir = FSDirectory.open(indexLocation);

    // OpenMode.CREATE discards whatever a previous run left in the directory.
    final SmartChineseAnalyzer chineseAnalyzer = new SmartChineseAnalyzer(Version.LUCENE_47);
    final IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_47, chineseAnalyzer);
    writerConfig.setOpenMode(OpenMode.CREATE);
    final IndexWriter indexWriter = new IndexWriter(dir, writerConfig);

    final Document sample = new Document();
    final TextField sentence = new TextField("field", "the quick brown fox jumped over the lazy dog",
            Field.Store.YES);
    sample.add(sentence);
    indexWriter.addDocument(sample);
    indexWriter.close();

    final IndexReader freshReader = DirectoryReader.open(dir);
    searcher = new IndexSearcher(freshReader);
}

From source file:com.yangxu.searchengine.index.IndexFiles.java

License:Apache License

/**
 * /*from w w w  .  jav a  2 s  .  c  o  m*/
 * @param createOrUpdate
 *             create update
 */
public void createIndex(boolean createOrUpdate) {
    if (docsPath == null) {
        System.err.println("docsPath not exists!");
        System.exit(1);
    }

    final File docDir = new File(docsPath);
    if (!docDir.exists() || !docDir.canRead()) {
        System.out.println("Document directory '" + docDir.getAbsolutePath()
                + "' does not exist or is not readable, please check the path");
        System.exit(1);
    }

    Date start = new Date();
    try {
        System.out.println("Indexing to directory '" + indexPath + "'...");

        Directory dir = FSDirectory.open(new File(indexPath));
        Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_31);
        IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_31, analyzer);

        if (createOrUpdate) {
            // Create a new index in the directory, removing any
            // previously indexed documents:
            iwc.setOpenMode(OpenMode.CREATE);
        } else {
            // Add new documents to an existing index:
            iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);
        }

        // Optional: for better indexing performance, if you
        // are indexing many documents, increase the RAM
        // buffer. But if you do this, increase the max heap
        // size to the JVM (eg add -Xmx512m or -Xmx1g):
        //
        // iwc.setRAMBufferSizeMB(256.0);

        IndexWriter writer = new IndexWriter(dir, iwc);
        indexDocs(writer, docDir);

        // NOTE: if you want to maximize search performance,
        // you can optionally call forceMerge here. This can be
        // a terribly costly operation, so generally it's only
        // worth it when your index is relatively static (ie
        // you're done adding documents to it):
        //
        // writer.forceMerge(1);

        writer.close();

        Date end = new Date();
        System.out.println(end.getTime() - start.getTime() + " total milliseconds");

    } catch (IOException e) {
        System.out.println(" caught a " + e.getClass() + "\n with message: " + e.getMessage());
    }
}

From source file:com.yangxu.searchengine.search.SearchFiles.java

License:Apache License

/**
 * Reads queries line by line, times each one over a fixed number of repetitions and hands
 * it to {@code doPagingSearch}. Stops at end of input or at the first blank line.
 *
 * @param field       default field the queries are parsed against
 * @param queries     path of a UTF-8 file with one query per line, or {@code null} to
 *                    prompt on standard input
 * @param hitsPerPage page size passed to {@code doPagingSearch}; the JVM exits with
 *                    status 1 when it is not positive
 * @throws Exception if the index cannot be opened, input cannot be read, or a query
 *                   cannot be parsed
 */
public void makeSearch(String field, String queries, int hitsPerPage) throws Exception {
    final int repeat = 10;
    final boolean raw = false;
    // Queries are typed interactively only when no query file was supplied.
    final boolean interactive = (queries == null);
    if (hitsPerPage <= 0) {
        System.err.println("There must be at least 1 hit per page.");
        System.exit(1);
    }

    IndexReader reader = IndexReader.open(FSDirectory.open(new File(indexPath)));
    try {
        IndexSearcher searcher = new IndexSearcher(reader);
        try {
            Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_31);

            BufferedReader in = null;
            if (queries != null) {
                in = new BufferedReader(new InputStreamReader(new FileInputStream(queries), "UTF-8"));
            } else {
                in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
            }

            try {
                QueryParser parser = new QueryParser(Version.LUCENE_31, field, analyzer);
                while (true) {
                    if (interactive) { // prompt the user
                        System.out.println("Enter query: ");
                    }
                    String line = in.readLine();

                    // readLine() returns null at end of input.
                    if (line == null) {
                        break;
                    }

                    line = line.trim();
                    if (line.length() == 0) {
                        break;
                    }

                    Query query = parser.parse(line);
                    System.out.println("Searching for: " + query.toString(field));

                    if (repeat > 0) { // repeat & time as benchmark
                        Date start = new Date();
                        for (int i = 0; i < repeat; i++) {
                            searcher.search(query, null, 100);
                        }
                        Date end = new Date();
                        System.out.println("Time: " + (end.getTime() - start.getTime()) + "ms");
                    }

                    doPagingSearch(in, searcher, query, hitsPerPage, raw, interactive);
                }
            } finally {
                // Only the file-backed reader is owned here; System.in is left open.
                if (!interactive) {
                    in.close();
                }
            }
        } finally {
            searcher.close();
        }
    } finally {
        // Release the index reader even when parsing or searching throws.
        reader.close();
    }
}

From source file:fr.ericlab.sondy.core.access.IndexAccess.java

License:Open Source License

/**
 * Opens the index of the currently selected dataset discretization and initializes
 * {@code writer} and {@code reader} on it.
 *
 * <p>The analyzer is chosen from {@code appVariables.stemmingLanguage}: {@code "Standard"}
 * (case-insensitive) selects {@link StandardAnalyzer}, {@code "Chinese"} selects
 * {@link SmartChineseAnalyzer}, and any other value is resolved reflectively as
 * {@code org.apache.lucene.analysis.<first two letters, lower-cased>.<language>Analyzer}.
 * Failures are logged at {@code SEVERE} and not rethrown, in which case {@code writer}
 * and {@code reader} are left unassigned.
 *
 * @param appVariables application state supplying the stemming language, the workspace
 *                     path and the current dataset and discretization names
 */
public IndexAccess(AppVariables appVariables) {
    try {
        Analyzer analyzer;
        if (appVariables.stemmingLanguage.equalsIgnoreCase("Standard")) {
            analyzer = new StandardAnalyzer(Version.LUCENE_36);
        } else if (appVariables.stemmingLanguage.equals("Chinese")) {
            analyzer = new SmartChineseAnalyzer(Version.LUCENE_36);
        } else {
            // Locale.ROOT keeps the package name stable under locales with special
            // casing rules (e.g. Turkish dotless i).
            String packageName = appVariables.stemmingLanguage.substring(0, 2)
                    .toLowerCase(java.util.Locale.ROOT);
            Class<?> cl = Class.forName("org.apache.lucene.analysis." + packageName + "."
                    + appVariables.stemmingLanguage + "Analyzer");
            // Look up the constructor that matches the single argument actually passed;
            // asking for (Version, Set) and supplying one argument always failed.
            Constructor<?> ct = cl.getConstructor(Version.class);
            analyzer = (Analyzer) ct.newInstance(Version.LUCENE_36);
        }
        FSDirectory indexDiscret = FSDirectory.open(new File(appVariables.configuration.getWorkspace()
                + "/datasets/" + appVariables.currentDatasetText.getText() + "/"
                + appVariables.getCurrentDatasetDiscretization()));
        IndexWriterConfig configDiscret = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        writer = new IndexWriter(indexDiscret, configDiscret);
        reader = IndexReader.open(writer, true);
    } catch (IOException | ClassNotFoundException | NoSuchMethodException | SecurityException
            | InstantiationException | IllegalAccessException | IllegalArgumentException
            | InvocationTargetException ex) {
        // All failures are logged under this class's own logger.
        Logger.getLogger(IndexAccess.class.getName()).log(Level.SEVERE, null, ex);
    }
}

From source file:fr.ericlab.sondy.core.access.MentionIndexAccess.java

License:Open Source License

/**
 * Opens the mention index (directory suffixed with {@code "-m"}) of the currently
 * selected dataset discretization and initializes {@code mentionWriter} and
 * {@code mentionReader} on it.
 *
 * <p>The analyzer is chosen from {@code appVariables.stemmingLanguage}: {@code "Standard"}
 * (case-insensitive) selects {@link StandardAnalyzer}, {@code "Chinese"} selects
 * {@link SmartChineseAnalyzer}, and any other value is resolved reflectively as
 * {@code org.apache.lucene.analysis.<first two letters, lower-cased>.<language>Analyzer}.
 * Failures are logged at {@code SEVERE} and not rethrown, in which case
 * {@code mentionWriter} and {@code mentionReader} are left unassigned.
 *
 * @param appVariables application state supplying the stemming language, the workspace
 *                     path and the current dataset and discretization names
 */
public MentionIndexAccess(AppVariables appVariables) {
    try {
        Analyzer analyzer;
        if (appVariables.stemmingLanguage.equalsIgnoreCase("Standard")) {
            analyzer = new StandardAnalyzer(Version.LUCENE_36);
        } else if (appVariables.stemmingLanguage.equals("Chinese")) {
            analyzer = new SmartChineseAnalyzer(Version.LUCENE_36);
        } else {
            // Locale.ROOT keeps the package name stable under locales with special
            // casing rules (e.g. Turkish dotless i).
            String packageName = appVariables.stemmingLanguage.substring(0, 2)
                    .toLowerCase(java.util.Locale.ROOT);
            Class<?> cl = Class.forName("org.apache.lucene.analysis." + packageName + "."
                    + appVariables.stemmingLanguage + "Analyzer");
            // Look up the constructor that matches the single argument actually passed;
            // asking for (Version, Set) and supplying one argument always failed.
            Constructor<?> ct = cl.getConstructor(Version.class);
            analyzer = (Analyzer) ct.newInstance(Version.LUCENE_36);
        }
        FSDirectory indexDiscret = FSDirectory.open(new File(appVariables.configuration.getWorkspace()
                + "/datasets/" + appVariables.currentDatasetText.getText() + "/"
                + appVariables.getCurrentDatasetDiscretization() + "-m"));
        IndexWriterConfig configDiscret = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        mentionWriter = new IndexWriter(indexDiscret, configDiscret);
        mentionReader = IndexReader.open(mentionWriter, true);
    } catch (IOException | ClassNotFoundException | NoSuchMethodException | SecurityException
            | InstantiationException | IllegalAccessException | IllegalArgumentException
            | InvocationTargetException ex) {
        // All failures are logged under this class's own logger.
        Logger.getLogger(MentionIndexAccess.class.getName()).log(Level.SEVERE, null, ex);
    }
}