Example usage for org.apache.lucene.analysis TokenStream incrementToken

List of usage examples for org.apache.lucene.analysis TokenStream incrementToken

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream incrementToken.

Prototype

public abstract boolean incrementToken() throws IOException;

Source Link

Document

Consumers (e.g., IndexWriter) call this method to advance the stream to the next token.

Usage

From source file:cl.usach.escalemania.sessionbeans.DocumentoFacade.java

/**
 * Splits {@code tweet} into the terms produced by {@code analyzer}.
 *
 * @param analyzer analyzer used to build the token stream
 * @param tweet    text to tokenize
 * @return the terms, in the order the stream emitted them
 * @throws RuntimeException wrapping any {@link IOException} raised while consuming the stream
 */
public List<String> tokenizeString(Analyzer analyzer, String tweet) {

    List<String> result = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(tweet));
        try {
            // The attribute instance is reused for every token, so look it up once.
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                result.add(term.toString());
            }
            stream.end();
        } finally {
            // Release the stream even when reset()/incrementToken() fails.
            stream.close();
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}

From source file:cn.com.szgao.enterprise.ExecutorsIndustry.java

/**
 * Reads {@code file} line by line as UTF-8, parses each line as a JSON
 * {@code EnterpriseVO}, tries to attach an industry derived from the record's
 * scope text, and appends the re-serialized record to an output file whose
 * path is derived from the input path.
 *
 * <p>Lines larger than 1 MiB (measured in platform-default bytes) are logged
 * and skipped without being counted. Counted lines whose 1-based index is
 * below {@code startNum} are skipped as well, as are blank lines and lines
 * that fail to parse.
 *
 * @param file     input file containing one JSON record per line
 * @param startNum 1-based index of the first counted line to process
 * @throws Exception if the input file cannot be opened or an I/O error occurs
 *                   while reading or closing the files
 */
@SuppressWarnings("rawtypes")
 private void readFileByLines(File file, int startNum) throws Exception {

     // Reported in the summary log line only; nothing in this method increments them.
     int basicSum = 0;
     int regSum = 0;
     int readNum = 0;

     // Output path: the input path with two literal substitutions applied.
     // NOTE(review): if neither substitution matches, the output path equals the input path
     // and the delete() below targets the input file itself - confirm against callers.
     String filePath = file.getPath().replace("???", "???industryId")
             .replace("E:", "D:");
     File fileS = new File(filePath);

     final int maxLineBytes = 1 * 1024 * 1024;
     BufferedReader reader = null;
     BufferedWriter fw = null;
     try {
         // A missing input file now surfaces as FileNotFoundException instead of a later
         // NullPointerException on the reader.
         reader = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));

         try {
             // Start from a fresh output file; FileOutputStream creates it when absent.
             if (fileS.exists()) {
                 fileS.delete();
             }
             fw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileS, true), "UTF-8"));
         } catch (FileNotFoundException e1) {
             e1.printStackTrace();
             log.error(e1);
         }

         String tempT;
         while ((tempT = reader.readLine()) != null) {

             // Oversized lines are logged and skipped before they are counted.
             long size = tempT.getBytes().length;
             if (size > maxLineBytes) {
                 log.error("--------?1M :" + tempT.substring(0, 500));
                 continue;
             }

             readNum++;
             if (readNum < startNum) {
                 continue;
             }
             System.out.println("-->>>>>>> " + (readNum) + "---??" + Thread.currentThread().getName() + "---"
                     + file.getPath());

             // Compare by content: the previous reference comparison (== "") never matched
             // lines returned by readLine().
             if (tempT.trim().isEmpty()) {
                 continue;
             }

             EnterpriseVO enterVO;
             try {
                 enterVO = gs.fromJson(tempT, EnterpriseVO.class);
             } catch (Exception e) {
                 e.printStackTrace();
                 log.error(e);
                 continue;
             }
             if (enterVO == null) {
                 // The parser may yield null (for example for a literal "null" line).
                 log.error("skipping line " + readNum + ": parsed to null");
                 continue;
             }

             Analyzer anal = new IKAnalyzer();
             String indu = removeBlank(enterVO.getScope());
             IndustryVO ivo = null;
             if (!StringUtils.isNull(indu)) {
                 for (String str : indu.split("[;.:,]")) {

                     // First try the whole segment as an industry name.
                     ivo = getIndustry(str);
                     if (!StringUtils.isNull(ivo.getIndustry_id())) {
                         break;
                     }
                     if (null != ivo.getIndustry_id()) {
                         continue;
                     }

                     // No direct hit: tokenize the segment and look each token up.
                     TokenStream ts = anal.tokenStream("", new StringReader(str));
                     try {
                         CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
                         ts.reset();

                         StringBuilder joined = new StringBuilder();
                         while (ts.incrementToken()) {
                             String token = term.toString();
                             if (!StringUtils.isNull(token)) {
                                 joined.append(token).append("|");
                             }
                         }
                         ts.end();

                         String[] arrStr1 = joined.toString().split("\\|");
                         StringUtils.sortStringArray(arrStr1, 1);

                         // Matches flagged "V" take precedence over all other matches.
                         List<IndustryVO> listiv_v = new ArrayList<IndustryVO>();
                         List<IndustryVO> listiv_n = new ArrayList<IndustryVO>();
                         for (String token : arrStr1) {
                             if (StringUtils.isNull(token)) {
                                 continue;
                             }
                             IndustryVO candidate = getIndustry(token);
                             if (StringUtils.isNull(candidate.getIndustry_id())) {
                                 continue;
                             }
                             if ("V".equals(candidate.getFlag())) {
                                 listiv_v.add(candidate);
                             } else {
                                 listiv_n.add(candidate);
                             }
                         }
                         if (listiv_v.size() > 0) {
                             ivo = getfirstSortStringArray(listiv_v, 1);
                             break;
                         }
                         if (listiv_n.size() > 0) {
                             ivo = getfirstSortStringArray(listiv_n, 1);
                             break;
                         }
                     } catch (IOException e) {
                         e.printStackTrace();
                     } finally {
                         // The analyzer is reused for the next segment, so the stream must be
                         // closed first; this also closes the underlying StringReader.
                         try {
                             ts.close();
                         } catch (IOException e) {
                             e.printStackTrace();
                         }
                     }
                 }
             }
             if (null != ivo) {
                 enterVO.setIndustry(ivo.getIndustry_name());
                 enterVO.setIndustryId(ivo.getIndustry_id());
             }

             writerString(fw, StringUtils.GSON.toJson(enterVO));
         }
         log.info("?regSum: " + regSum + " ?basicSum: " + basicSum + " readNum: "
                 + readNum + " -: " + (readNum - basicSum - regSum) + "---??" + file.getPath());
     } finally {
         // Close (and thereby flush) the writer first, then the reader, even on failure.
         try {
             if (fw != null) {
                 fw.close();
             }
         } finally {
             if (reader != null) {
                 reader.close();
             }
         }
     }
 }

From source file:cn.com.szgao.enterprise.ExecutorsIndustry.java

/**
 * Cleans the fields of an enterprise record in place: company name, scope
 * text, registered capital (raw text, numeric value and unit), industry,
 * registration number, and the holder / punishment / change / report
 * sections.
 *
 * @param enterVO record to clean; it is modified and returned
 * @return the same {@code enterVO} instance
 * @throws IOException if creating the token stream for the scope text fails
 */
 public static EnterpriseVO washEnterpriseVO(EnterpriseVO enterVO) throws IOException {

     if (!StringUtils.isNull(enterVO.getCompany())) {
         enterVO.setCompany(deleteMoreFuhao(enterVO.getCompany()));
     }

     // Scope: strip the "&nbsp;" entity, CR, TAB and '^'. The entity is matched as a whole;
     // the previous pattern was a character class that deleted every '&', 'n', 'b', 's',
     // 'p' and ';' individually.
     String scope = enterVO.getScope();
     if (!StringUtils.isNull(scope)) {
         scope = scope.replaceAll("&nbsp;|[\r\t^]", "");
         if (scope.length() > 5) {
             // Only the last five characters are cleared of '#', '*' and '+'.
             int tailStart = scope.length() - 5;
             String temp = scope.substring(tailStart).replaceAll("[#*+]", "");
             scope = scope.substring(0, tailStart) + temp;
         }
         enterVO.setScope(scope);
     }

     // Registered capital: keep the cleaned text, extract the number, derive the unit.
     String regCapital = enterVO.getRegCapital();
     if (!StringUtils.isNull(regCapital)) {

         String capital = regCapital.replaceAll("&nbsp;|[\r\t\n]", "");
         enterVO.setRegCapitalO(capital);

         Double regCapitalN = getBankingDouble(capital);
         if (null != regCapitalN) {
             enterVO.setRegCapitalN(regCapitalN);
             enterVO.setRegCapital(null);

             // NOTE(review): the literals in this chain look like they were lost to an
             // encoding problem. As written, contains("") is always true, so the first
             // branch always wins. Restore the intended literals from the original source
             // before relying on the unit value.
             if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("?");
             } else if (regCapital.contains("?")) {
                 enterVO.setUnit("??");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("?")) {
                 enterVO.setUnit("?");
             } else if (regCapital.contains("?")) {
                 enterVO.setUnit("?");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("?")) {
                 enterVO.setUnit("?");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             }

             else {
                 enterVO.setUnit("?");
             }
         }

     }

     // Industry: match each scope segment directly, then fall back to its tokens.
     Analyzer anal = new IKAnalyzer();
     String indu = removeBlank(enterVO.getScope());
     IndustryVO ivo = null;
     if (!StringUtils.isNull(indu)) {
         for (String str : indu.split("[;.:,]")) {

             ivo = getIndustry(str);
             if (!StringUtils.isNull(ivo.getIndustry_id())) {
                 break;
             }
             if (null != ivo.getIndustry_id()) {
                 continue;
             }

             TokenStream ts = anal.tokenStream("", new StringReader(str));
             try {
                 CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
                 ts.reset();

                 StringBuilder joined = new StringBuilder();
                 while (ts.incrementToken()) {
                     String token = term.toString();
                     if (!StringUtils.isNull(token)) {
                         joined.append(token).append("|");
                     }
                 }
                 ts.end();

                 String[] arrStr1 = joined.toString().split("\\|");
                 StringUtils.sortStringArray(arrStr1, 1);

                 // Matches flagged "V" take precedence over all other matches.
                 List<IndustryVO> listiv_v = new ArrayList<IndustryVO>();
                 List<IndustryVO> listiv_n = new ArrayList<IndustryVO>();
                 for (String token : arrStr1) {
                     if (StringUtils.isNull(token)) {
                         continue;
                     }
                     IndustryVO candidate = getIndustry(token);
                     if (StringUtils.isNull(candidate.getIndustry_id())) {
                         continue;
                     }
                     if ("V".equals(candidate.getFlag())) {
                         listiv_v.add(candidate);
                     } else {
                         listiv_n.add(candidate);
                     }
                 }
                 if (listiv_v.size() > 0) {
                     ivo = getfirstSortStringArray(listiv_v, 1);
                     break;
                 }
                 if (listiv_n.size() > 0) {
                     ivo = getfirstSortStringArray(listiv_n, 1);
                     break;
                 }
             } catch (IOException e) {
                 e.printStackTrace();
             } finally {
                 // The analyzer is reused for the next segment, so the stream must be
                 // closed first; this also closes the underlying StringReader.
                 try {
                     ts.close();
                 } catch (IOException e) {
                     e.printStackTrace();
                 }
             }
         }
     }
     if (null != ivo) {
         enterVO.setIndustry(ivo.getIndustry_name());
         enterVO.setIndustryId(ivo.getIndustry_id());
     }

     // Registration number: cut at the first NUL character, if any.
     if (!StringUtils.isNull(enterVO.getRegNum())) {
         int nul = enterVO.getRegNum().indexOf("\u0000");
         if (nul != -1) {
             enterVO.setRegNum(enterVO.getRegNum().substring(0, nul));
         }
     }

     enterVO.setHolder(WashEtp.clearHolder(enterVO));
     enterVO.setPunishment(WashEtp.clearPunishment(enterVO));
     enterVO.setChange(WashEtp.clearChangeItem(enterVO));
     enterVO.setReport(WashEtp.clearReport(enterVO));

     return enterVO;
 }

From source file:cn.com.szgao.enterprise.FileIntoDataBase2p5.java

/**
 * Tokenizes {@code str} with the IK analyzer, looks every token up with
 * {@code getIndustry}, and returns the entry that
 * {@code getfirstSortStringArray} selects among the tokens that resolved to
 * an industry id.
 *
 * @param str text to tokenize
 * @return the selected industry, or an empty {@code IndustryVO} when no token
 *         resolved to an industry id or tokenizing failed
 * @author liuming
 */
 public static IndustryVO getIndustryVOFromIk(String str) {
     Analyzer anal = new IKAnalyzer();
     IndustryVO ivo = new IndustryVO();

     TokenStream ts = anal.tokenStream("", new StringReader(str));
     try {
         CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
         ts.reset();

         // Collect the non-empty tokens, '|'-separated, then split them back into an array.
         StringBuilder joined = new StringBuilder();
         while (ts.incrementToken()) {
             String token = term.toString();
             if (!StringUtils.isNull(token)) {
                 joined.append(token).append("|");
             }
         }
         ts.end();

         String[] arrStr1 = joined.toString().split("\\|");
         StringUtils.sortStringArray(arrStr1, 1);

         List<IndustryVO> listiv = new ArrayList<IndustryVO>();
         for (String token : arrStr1) {
             if (StringUtils.isNull(token)) {
                 continue;
             }
             IndustryVO candidate = getIndustry(token);
             if (!StringUtils.isNull(candidate.getIndustry_id())) {
                 listiv.add(candidate);
             }
         }

         if (listiv.size() > 0) {
             ivo = getfirstSortStringArray(listiv, 1);
         }

     } catch (IOException e) {
         e.printStackTrace();
     } finally {
         // Always release the stream; this also closes the underlying StringReader.
         try {
             ts.close();
         } catch (IOException e) {
             e.printStackTrace();
         }
     }
     return ivo;
 }

From source file:cn.com.szgao.enterprise.FileIntoDataBase2p5WashJson.java

/**
 * Tokenizes {@code str} with the IK analyzer, looks every token up with
 * {@code getIndustry}, and returns the entry that
 * {@code getfirstSortStringArray} selects among the tokens that resolved to
 * an industry id.
 *
 * @param str text to tokenize
 * @return the selected industry, or an empty {@code IndustryVO} when no token
 *         resolved to an industry id or tokenizing failed
 * @author liuming
 */
 public static IndustryVO getIndustryVOFromIk(String str) {
     Analyzer anal = new IKAnalyzer();
     IndustryVO ivo = new IndustryVO();

     TokenStream ts = anal.tokenStream("", new StringReader(str));
     try {
         CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
         ts.reset();

         // Collect the non-empty tokens, '|'-separated, then split them back into an array.
         StringBuilder joined = new StringBuilder();
         while (ts.incrementToken()) {
             String token = term.toString();
             if (!StringUtils.isNull(token)) {
                 joined.append(token).append("|");
             }
         }
         ts.end();

         String[] arrStr1 = joined.toString().split("\\|");
         StringUtils.sortStringArray(arrStr1, 1);

         List<IndustryVO> listiv = new ArrayList<IndustryVO>();
         for (String token : arrStr1) {
             if (StringUtils.isNull(token)) {
                 continue;
             }
             IndustryVO candidate = getIndustry(token);
             if (!StringUtils.isNull(candidate.getIndustry_id())) {
                 listiv.add(candidate);
             }
         }

         if (listiv.size() > 0) {
             ivo = getfirstSortStringArray(listiv, 1);
         }

     } catch (IOException e) {
         e.printStackTrace();
     } finally {
         // Always release the stream; this also closes the underlying StringReader.
         try {
             ts.close();
         } catch (IOException e) {
             e.printStackTrace();
         }
     }
     return ivo;
 }

From source file:cn.edu.thss.iise.beehivez.server.index.labelindex.LabelLuceneIndex.java

License:Open Source License

/**
 * Checks whether the label index contains {@code label}.
 *
 * <p>The label is analyzed into terms, a boolean query requiring every term is
 * run against the index, and the decision is left to an
 * {@code ExactLabelQueryResultCollector}.
 *
 * @param label label to look up
 * @return the collector's verdict, or {@code false} if any exception occurred
 */
public boolean contain(String label) {
    IndexReader reader = null;
    Searcher searcher = null;
    try {
        reader = IndexReader.open(this.indexDir, true);
        searcher = new IndexSearcher(reader);

        // Analyze the label into its distinct terms.
        HashSet<String> queryTermSet = new HashSet<String>();
        TokenStream stream = analyzer.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(label));
        try {
            TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                queryTermSet.add(termAtt.term());
            }
            stream.end();
        } finally {
            stream.close();
        }

        // Every term must occur in a matching label.
        BooleanQuery bq = new BooleanQuery();
        for (String s : queryTermSet) {
            bq.add(new TermQuery(new Term(LabelDocument.FIELD_LABEL, s)), Occur.MUST);
        }

        ExactLabelQueryResultCollector collector = new ExactLabelQueryResultCollector(reader, label);
        searcher.search(bq, collector);
        return collector.isExistQueryLabel();
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Release the index resources on every path, including failures.
        if (searcher != null) {
            try {
                searcher.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        if (reader != null) {
            try {
                reader.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
    return false;
}

From source file:cn.edu.thss.iise.beehivez.server.index.labelindex.LabelLuceneIndex.java

License:Open Source License

/**
 * Searches the label index for labels similar to {@code query}.
 *
 * <p>The query is analyzed into terms; each term and the stemmed form of each
 * of its synonyms is added as an optional clause, and the hits are gathered by
 * a {@code SimilarLabelQueryResultCollector}.
 *
 * @param query      label text to search for; {@code null} yields a single
 *                   result built from {@code (null, 1)}
 * @param similarity threshold handed to the result collector
 * @return the collector's results, or an empty set if any exception occurred
 *         before results were collected
 */
public TreeSet<SimilarLabelQueryResult> getSimilarLabels(String query, float similarity) {
    TreeSet<SimilarLabelQueryResult> ret = new TreeSet<SimilarLabelQueryResult>();
    if (query == null) {
        ret.add(new SimilarLabelQueryResult(null, 1));
        return ret;
    }

    IndexReader reader = null;
    Searcher searcher = null;
    try {
        reader = IndexReader.open(this.indexDir, true);
        searcher = new IndexSearcher(reader);

        // Analyze the query into its distinct terms.
        HashSet<String> queryTermSet = new HashSet<String>();
        TokenStream stream = analyzer.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(query));
        try {
            TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                queryTermSet.add(termAtt.term());
            }
            stream.end();
        } finally {
            stream.close();
        }

        // Build the query: each term, plus each stemmed synonym not seen before.
        BooleanQuery bq = new BooleanQuery();
        SynonymMap synMap = SynonymIndex.getSynonymMap();
        HashSet<String> expandedQueryTermSet = new HashSet<String>(queryTermSet);
        for (String s : queryTermSet) {
            bq.add(new TermQuery(new Term(LabelDocument.FIELD_LABEL, s)), Occur.SHOULD);
            for (String syn : synMap.getSynonyms(s)) {
                stemer.setCurrent(syn);
                stemer.stem();
                String stemmed = stemer.getCurrent();
                if (expandedQueryTermSet.add(stemmed)) {
                    bq.add(new TermQuery(new Term(LabelDocument.FIELD_LABEL, stemmed)), Occur.SHOULD);
                }
            }
        }

        // Search the label index.
        SimilarLabelQueryResultCollector collector = new SimilarLabelQueryResultCollector(reader, queryTermSet,
                similarity);
        searcher.search(bq, collector);
        ret = collector.getQueryResult();
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Release the index resources on every path, including failures.
        if (searcher != null) {
            try {
                searcher.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        if (reader != null) {
            try {
                reader.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
    return ret;
}

From source file:cn.edu.thss.iise.beehivez.server.index.luceneindex.analyzer.SemicolonAnalyzer.java

License:Open Source License

/**
 * @param args/*from  w w w .ja  v  a2  s  . c om*/
 */
public static void main(String[] args) throws IOException {
    // text to tokenize
    final String text = "This is a demo of , the new TokenStream API";

    SemicolonAnalyzer analyzer = new SemicolonAnalyzer();
    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));

    // get the TermAttribute from the TokenStream
    TermAttribute termAtt = stream.addAttribute(TermAttribute.class);

    stream.reset();

    // print all tokens until stream is exhausted
    while (stream.incrementToken()) {
        System.out.println(termAtt.term());
    }

    stream.end();
    stream.close();

}

From source file:cn.edu.thss.iise.beehivez.server.util.StringSimilarityUtil.java

License:Open Source License

/**
 * tokenize the given string, all the words are extracted, lowercased, all
 * the stop words are removed, and all the words are replaced with their
 * stem/*from w w w.j  av  a 2s  .com*/
 * 
 * @param label
 * @return
 */
public static HashSet<String> snowballTokenize(String label) {
    HashSet<String> ret = new HashSet<String>();
    try {
        Analyzer analyzer = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English",
                StandardAnalyzer.STOP_WORDS_SET);

        TokenStream stream = analyzer.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(label));
        TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            ret.add(termAtt.term());
        }
        stream.end();
        stream.close();
    } catch (Exception e) {
        e.printStackTrace();
    }

    return ret;
}

From source file:cn.jcenterhome.web.action.CpAction.java

/**
 * Extracts up to five keywords from {@code text}.
 *
 * <p>The text is tokenized with the IK analyzer and the occurrences of every
 * token passing the length filter are counted. The text is then indexed into
 * an in-memory directory, each candidate is scored as
 * {@code maxScore * count} against that index, and the first five entries
 * returned by {@code getSortedHashtableByValue} are kept.
 *
 * @param text text to analyze
 * @return at most five keywords; empty if {@code Common.empty(text)} holds,
 *         no token passes the filter, or scoring fails
 * @throws IOException if tokenizing the text fails
 */
private List<String> getKeyWord(String text) throws IOException {
    List<String> keywords = new ArrayList<String>();
    if (Common.empty(text)) {
        return keywords;
    }

    // Count the occurrences of each candidate word.
    Map<String, Integer> words = new HashMap<String, Integer>();
    Analyzer analyzer = new IKAnalyzer(true);
    TokenStream tokenStream = analyzer.tokenStream("*", new StringReader(text));
    try {
        TermAttribute termAtt = tokenStream.addAttribute(TermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String word = termAtt.term();
            if (word.length() > 1 && Common.strlen(word) > 2) {
                Integer count = words.get(word);
                words.put(word, count == null ? 1 : count + 1);
            }
        }
        tokenStream.end();
    } finally {
        // Release the stream (and its reader) even if tokenizing fails.
        tokenStream.close();
    }
    if (words.isEmpty()) {
        return keywords;
    }

    Directory dir = null;
    IndexSearcher searcher = null;
    try {
        String fieldName = "text";
        dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, analyzer, true, IndexWriter.MaxFieldLength.LIMITED);
        try {
            Document doc = new Document();
            doc.add(new Field(fieldName, text, Field.Store.YES, Field.Index.ANALYZED));
            writer.addDocument(doc);
        } finally {
            writer.close();
        }

        searcher = new IndexSearcher(dir);
        searcher.setSimilarity(new IKSimilarity());

        // Weight each word's score against the single-document index by its count.
        Map<String, Float> temps = new HashMap<String, Float>();
        for (Entry<String, Integer> word : words.entrySet()) {
            Query query = IKQueryParser.parse(fieldName, word.getKey());
            TopDocs topDocs = searcher.search(query, 1);
            if (topDocs.totalHits > 0) {
                temps.put(word.getKey(), topDocs.getMaxScore() * word.getValue());
            }
        }

        Entry<String, Float>[] keywordEntry = getSortedHashtableByValue(temps);
        for (Entry<String, Float> entry : keywordEntry) {
            if (keywords.size() >= 5) {
                break;
            }
            keywords.add(entry.getKey());
        }
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Either resource may still be null if setup failed part-way.
        if (searcher != null) {
            try {
                searcher.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (dir != null) {
            try {
                dir.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return keywords;
}