Example usage for org.apache.lucene.analysis TokenStream reset

List of usage examples for org.apache.lucene.analysis TokenStream reset

Introduction

On this page you can find example usages of the reset() method of org.apache.lucene.analysis.TokenStream.

Prototype

public void reset() throws IOException 

Source Link

Document

This method is called by a consumer before it begins consuming tokens with incrementToken().

Usage

From source file:cn.com.szgao.enterprise.ExecutorsIndustry.java

/**
  * Cleans ("washes") the fields of an enterprise record in place and returns it.
  *
  * <p>In order, the method: runs the company name through {@code deleteMoreFuhao};
  * strips whitespace and caret characters from the scope text; copies the
  * registered capital (whitespace removed) to {@code regCapitalO}, parses it with
  * {@code getBankingDouble} and, when that succeeds, stores the number and a unit;
  * tries to resolve an industry from the scope text, first by a direct lookup of
  * each scope fragment and then by looking up the IK-analyzer tokens of that
  * fragment; truncates the registration number at the first NUL character; and
  * finally delegates holder, punishment, change and report clean-up to
  * {@code WashEtp}.
  *
  * <p>NOTE(review): several string literals and comments in this method appear to
  * have lost their original (presumably non-ASCII) characters; see the inline notes.
  *
  * @param enterVO the record to clean; it is mutated and dereferenced immediately,
  *        so it must not be {@code null}
  * @return the same {@code enterVO} instance that was passed in
  * @throws IOException declared by the signature; the tokenizing loop catches
  *         {@code IOException} itself, so this presumably originates from a call
  *         outside that inner try block - TODO confirm
  */
 public static EnterpriseVO washEnterpriseVO(EnterpriseVO enterVO) throws IOException {

     // Company name: only rewritten when StringUtils.isNull does not reject it.
     if (!StringUtils.isNull(enterVO.getCompany())) {
         enterVO.setCompany(deleteMoreFuhao(enterVO.getCompany()));
     }

     // Business scope: remove spaces, CR, TAB and literal '^' characters everywhere
     // ('^' is not first in the character class, so it is a literal caret).
     String scope = enterVO.getScope();
     if (!StringUtils.isNull(scope)) {
         scope = scope.replaceAll("[ \r\t^]", "");
         if (scope.length() > 5) {
             String temp = "";
             // System.out.println(scope);
             // if (scope.substring(scope.length() - 5).contains("^")) {
             // temp = scope.substring(scope.length() - 5).replace("^", "");
             // scope = scope.substring(0, scope.length() - 5) + temp;
             // } else {
             // Strip '#', '*' and '+' from the last five characters only; the head
             // of the string is kept unchanged.
             temp = scope.substring(scope.length() - 5).replaceAll("[#*+]", "");
             scope = scope.substring(0, scope.length() - 5) + temp;
             // }
         }
         enterVO.setScope(scope);
     }
     String regCapital = enterVO.getRegCapital();
     // Registered capital: keep a whitespace-free copy of the raw text and try to
     // parse a numeric amount out of it.
     if (!StringUtils.isNull(regCapital)) {

         // Removes spaces, CR, TAB and LF from the raw capital text.
         String capital = enterVO.getRegCapital().replaceAll("[ \r\t\n]", "");
         // enterVO.setRegCapital(capital);

         enterVO.setRegCapitalO(capital);

         Double regCapitalN = getBankingDouble(capital);
         if (null != regCapitalN) {
             // Parsing succeeded: store the number and clear the textual field.
             enterVO.setRegCapitalN(regCapitalN);
             enterVO.setRegCapital(null);

             // Unit detection, tested against the original (unstripped) regCapital.
             // NOTE(review): the literals in this chain look like they lost their
             // original characters. As written, contains("") is true for every
             // string, so the first branch always wins and the unit is always set
             // to "" - confirm against the original source before relying on it.
             if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("?");
             } else if (regCapital.contains("?")) {
                 enterVO.setUnit("??");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("?")) {
                 enterVO.setUnit("?");
             } else if (regCapital.contains("?")) {
                 enterVO.setUnit("?");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("?")) {
                 enterVO.setUnit("?");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             } else if (regCapital.contains("")) {
                 enterVO.setUnit("");
             }

             else {
                 // Fallback unit when none of the markers above matched.
                 enterVO.setUnit("?");
             }
         }

     }

     // Industry resolution. Both locals stay null: the lookup that used to fill
     // 'industry' is commented out below, so the branch guarded by
     // StringUtils.isNull(industry) is presumably always taken - confirm isNull(null).
     String industry = null;
     String industryId = null;

     /*
      * if (!StringUtils.isNull(enterVO.getCompany())) { String key =
      * StringUtils.NBG.generate(enterVO.getCompany()).toString();
      * JsonDocument queryDoc = ExecutorsText.bucket.get(key, 60,
      * TimeUnit.MINUTES);
      * 
      * if (null == queryDoc) { // System.out.println(
      * "NULL queryDoc-------------------- " + // key); //
      * System.out.println("NULL queryDoc-------------------- " + // key);
      * 
      * } else { BusinessDirectoryVO vo =
      * StringUtils.GSON.fromJson(queryDoc.content().toString(),
      * BusinessDirectoryVO.class); if
      * (!StringUtils.isNull(vo.getIndustry())) { industry =
      * vo.getIndustry(); } } }
      */

     // A new IK analyzer is created per call.
     // NOTE(review): it is never closed in this method.
     Analyzer anal = new IKAnalyzer();
     String indu = removeBlank(enterVO.getScope());
     IndustryVO ivo = null;
     IndustryVO ivo1 = null;
     if (StringUtils.isNull(industry)) {
         if (!StringUtils.isNull(indu)) {
             String[] sourceStrArray = indu.split("[;.:,]");// split the scope into fragments on ; . : ,
             for (String str : sourceStrArray) {
                 // System.out.println("-- " + str);

                 // First attempt: look the whole fragment up directly.
                 // NOTE(review): ivo is dereferenced without a null check, so
                 // getIndustry is assumed never to return null - confirm.
                 ivo = getIndustry(str);
                 if (!StringUtils.isNull(ivo.getIndustry_id())) {
                     break;
                 }

                 // Second attempt: tokenize the fragment with IK and look up each
                 // token. Only entered when the id is exactly null.
                 if (null == ivo.getIndustry_id()) {

                     StringReader reader = new StringReader(str);
                     // Token stream over the fragment.
                     // NOTE(review): ts is never end()ed or close()d below.
                     TokenStream ts = null;
                     ts = anal.tokenStream("", reader);
                     CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
                     try {
                         ts.reset();

                         String strs = "";
                         // Join every token accepted by StringUtils.isNull with a
                         // trailing '|' so the tokens can be split again below.
                         while (ts.incrementToken()) {
                             String temp = term.toString();
                             if (!StringUtils.isNull(temp)) {
                                 // getIndustry(temp);
                                 strs += term.toString() + "|";
                                 // System.out.print(term.toString() + "|" );
                             }
                         }
                         reader.close();

                         String[] arrStr1 = strs.split("\\|");
                         StringUtils.sortStringArray(arrStr1, 1);// presumably orders the tokens in place - TODO confirm what mode 1 means
                         List<IndustryVO> listiv = new ArrayList<IndustryVO>();
                         List<IndustryVO> listiv_v = new ArrayList<IndustryVO>();
                         List<IndustryVO> listiv_n = new ArrayList<IndustryVO>();

                         // Collect every token lookup that carries an industry id.
                         for (int i = 0; i < arrStr1.length; i++) {
                             String temp = arrStr1[i];
                             if (!StringUtils.isNull(temp)) {
                                 ivo1 = getIndustry(temp);
                                 if (!StringUtils.isNull(ivo1.getIndustry_id())) {
                                     listiv.add(ivo1);
                                     // break;
                                 }
                             }
                         }
                         // Partition the hits: flag "V" goes to listiv_v, everything
                         // else to listiv_n.
                         if (listiv.size() > 0) {
                             for (IndustryVO industryVO : listiv) {
                                 if ("V".equals(industryVO.getFlag())) {
                                     listiv_v.add(industryVO);
                                 } else {
                                     listiv_n.add(industryVO);
                                 }
                             }
                         }
                         // "V" hits take precedence over the others; either way the
                         // break leaves the enclosing for loop over the fragments.
                         if (listiv_v.size() > 0) {
                             ivo = getfirstSortStringArray(listiv_v, 1);
                             break;
                         }
                         if (listiv_n.size() > 0) {
                             ivo = getfirstSortStringArray(listiv_n, 1);
                             break;
                         }

                     } catch (IOException e) {
                         // Tokenizing failures are only printed; the loop then moves
                         // on to the next fragment.
                         e.printStackTrace();
                     }
                 }

                 if (!StringUtils.isNull(ivo.getIndustry_id())) {
                     break;
                 }
             }
         }
     }
     // ivo is non-null as soon as one fragment was looked up, even when no industry
     // id was found; in that case whatever name/id it carries is copied over.
     if (null != ivo) {
         enterVO.setIndustry(ivo.getIndustry_name());
         enterVO.setIndustryId(ivo.getIndustry_id());
     }

     if (!StringUtils.isNull(enterVO.getRegNum())) {

         // Cut the registration number at the first NUL character (U+0000), if any.
         if (enterVO.getRegNum().indexOf("\u0000") != -1) {
             enterVO.setRegNum(enterVO.getRegNum().substring(0, enterVO.getRegNum().indexOf("\u0000")));
         }
     }

     // Remaining sections are cleaned by the WashEtp helpers.
     enterVO.setHolder(WashEtp.clearHolder(enterVO));// holder section
     enterVO.setPunishment(WashEtp.clearPunishment(enterVO));// punishment section
     enterVO.setChange(WashEtp.clearChangeItem(enterVO));// change-item section
     enterVO.setReport(WashEtp.clearReport(enterVO));// report section

     return enterVO;
 }

From source file:cn.com.szgao.enterprise.FileIntoDataBase2p5.java

/**
  * Tokenizes {@code str} with the IK analyzer and resolves the tokens to an industry.
  *
  * <p>Every token accepted by {@code StringUtils.isNull} is looked up through
  * {@code getIndustry}; the lookups that carry an industry id are collected and
  * {@code getfirstSortStringArray(candidates, 1)} selects the one that is returned.
  *
  * <p>The token stream is consumed following the Lucene workflow
  * (reset, incrementToken, end, close). The stream, the reader and the analyzer
  * are released in a {@code finally} block so that nothing leaks when tokenizing
  * fails.
  *
  * @param str text to tokenize; {@code null} yields an unpopulated {@code IndustryVO}
  * @return the selected industry, or a freshly constructed {@code IndustryVO} when
  *         {@code str} is {@code null}, no token resolves to an industry id, or an
  *         {@code IOException} occurs while tokenizing (the exception is printed)
  * @author liuming
  * @date 2016-06-29
  */
 public static IndustryVO getIndustryVOFromIk(String str) {
     IndustryVO ivo = new IndustryVO();
     if (str == null) {
         // new StringReader(null) would throw a NullPointerException.
         return ivo;
     }

     Analyzer anal = new IKAnalyzer();
     StringReader reader = new StringReader(str);
     TokenStream ts = null;
     try {
         ts = anal.tokenStream("", reader);
         CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
         ts.reset();

         // Tokens are joined with '|' and split again so that the array handed to
         // sortStringArray has exactly the shape the helper has always received
         // (including the single empty element produced for "no tokens").
         StringBuilder joined = new StringBuilder();
         while (ts.incrementToken()) {
             String token = term.toString();
             if (!StringUtils.isNull(token)) {
                 joined.append(token).append('|');
             }
         }
         // Required by the TokenStream contract once the last token was consumed.
         ts.end();

         String[] tokens = joined.toString().split("\\|");
         StringUtils.sortStringArray(tokens, 1);

         // Keep only the lookups that actually carry an industry id.
         List<IndustryVO> candidates = new ArrayList<IndustryVO>();
         for (String token : tokens) {
             if (!StringUtils.isNull(token)) {
                 IndustryVO hit = getIndustry(token);
                 if (!StringUtils.isNull(hit.getIndustry_id())) {
                     candidates.add(hit);
                 }
             }
         }

         if (!candidates.isEmpty()) {
             ivo = getfirstSortStringArray(candidates, 1);
         }
     } catch (IOException e) {
         e.printStackTrace();
     } finally {
         if (ts != null) {
             try {
                 ts.close();
             } catch (IOException e) {
                 e.printStackTrace();
             }
         }
         reader.close();
         anal.close();
     }
     return ivo;
 }

From source file:cn.com.szgao.enterprise.FileIntoDataBase2p5WashJson.java

/**
  * ik??/*  w  ww . j  ava  2  s.  co m*/
  * 
  * @param str
  * @return
  * @return IndustryVO
  * @author liuming
  * @date 2016629 ?2:36:09
  */
 public static IndustryVO getIndustryVOFromIk(String str) {
     // ?
     Analyzer anal = new IKAnalyzer();
     IndustryVO ivo = new IndustryVO();
     IndustryVO ivo1 = new IndustryVO();

     StringReader reader = new StringReader(str);
     // ?
     TokenStream ts = null;
     ts = anal.tokenStream("", reader);
     CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
     try {
         ts.reset();

         String strs = "";
         // ????
         while (ts.incrementToken()) {
             String temp = term.toString();
             if (!StringUtils.isNull(temp)) {
                 // getIndustry(temp);
                 strs += term.toString() + "|";
                 // System.out.print(term.toString() + "|" );
             }
         }
         reader.close();

         String[] arrStr1 = strs.split("\\|");
         StringUtils.sortStringArray(arrStr1, 1);// ?
         List<IndustryVO> listiv = new ArrayList<IndustryVO>();
         List<IndustryVO> listiv_v = new ArrayList<IndustryVO>();
         List<IndustryVO> listiv_n = new ArrayList<IndustryVO>();

         for (int i = 0; i < arrStr1.length; i++) {
             String temp = arrStr1[i];
             if (!StringUtils.isNull(temp)) {
                 ivo1 = getIndustry(temp);
                 if (!StringUtils.isNull(ivo1.getIndustry_id())) {
                     listiv.add(ivo1);
                     // break;
                 }
             }
         }
         // ?????
         // if (listiv.size() > 0) {
         // for (IndustryVO industryVO : listiv) {
         // if ("V".equals(industryVO.getFlag())) {
         // listiv_v.add(industryVO);
         // } else {
         // listiv_n.add(industryVO);
         // }
         // }
         // }

         // ???
         if (listiv.size() > 0) {
             ivo = getfirstSortStringArray(listiv, 1);
         }

         // if (listiv_v.size() > 0) {
         // ivo = getfirstSortStringArray(listiv_v, 1);
         // }
         // if (listiv_n.size() > 0) {
         // ivo = getfirstSortStringArray(listiv_n, 1);
         // }

     } catch (IOException e) {
         e.printStackTrace();
     }
     return ivo;
 }

From source file:cn.edu.thss.iise.beehivez.server.index.labelindex.LabelLuceneIndex.java

License:Open Source License

/**
 * Checks whether the label index contains the given label.
 *
 * <p>The label is tokenized with this index's analyzer, a conjunctive
 * ({@code Occur.MUST}) boolean query is built from the distinct terms, and an
 * {@code ExactLabelQueryResultCollector} decides whether one of the hits is the
 * queried label.
 *
 * <p>The token stream, searcher and index reader are always released, including
 * when tokenizing or searching fails.
 *
 * @param label the label to look for; {@code null} is reported as not contained
 * @return {@code true} if the collector reports the label as present;
 *         {@code false} otherwise or when any exception occurs (it is printed)
 */
public boolean contain(String label) {
    if (label == null) {
        // Avoids opening the index only to fail on new StringReader(null).
        return false;
    }
    IndexReader reader = null;
    Searcher searcher = null;
    try {
        reader = IndexReader.open(this.indexDir, true);
        searcher = new IndexSearcher(reader);

        // Collect the distinct terms the analyzer produces for the label.
        HashSet<String> queryTermSet = new HashSet<String>();
        TokenStream stream = analyzer.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(label));
        try {
            TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                queryTermSet.add(termAtt.term());
            }
            stream.end();
        } finally {
            stream.close();
        }

        // Every term must match.
        BooleanQuery bq = new BooleanQuery();
        for (String s : queryTermSet) {
            bq.add(new TermQuery(new Term(LabelDocument.FIELD_LABEL, s)), Occur.MUST);
        }

        ExactLabelQueryResultCollector collector = new ExactLabelQueryResultCollector(reader, label);
        searcher.search(bq, collector);
        return collector.isExistQueryLabel();
    } catch (Exception e) {
        e.printStackTrace();
        return false;
    } finally {
        // Close failures are only reported; they must not mask the result.
        if (searcher != null) {
            try {
                searcher.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        if (reader != null) {
            try {
                reader.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
}

From source file:cn.edu.thss.iise.beehivez.server.index.labelindex.LabelLuceneIndex.java

License:Open Source License

/**
 * Finds labels in the index that are similar to the given query string.
 *
 * <p>The query is tokenized with this index's analyzer. Each distinct term, plus
 * the stemmed form of each of its synonyms, is added as an optional
 * ({@code Occur.SHOULD}) clause of a boolean query, and a
 * {@code SimilarLabelQueryResultCollector} gathers the result from the hits.
 *
 * <p>The token stream, searcher and index reader are always released, including
 * when tokenizing or searching fails.
 *
 * @param query the label to search for; for {@code null} the result contains the
 *        single entry {@code new SimilarLabelQueryResult(null, 1)}
 * @param similarity the similarity value handed to the result collector
 * @return the collector's result set; an empty set when an exception occurs
 *         (the exception is printed)
 */
public TreeSet<SimilarLabelQueryResult> getSimilarLabels(String query, float similarity) {
    TreeSet<SimilarLabelQueryResult> ret = new TreeSet<SimilarLabelQueryResult>();
    if (query == null) {
        ret.add(new SimilarLabelQueryResult(null, 1));
        return ret;
    }

    IndexReader reader = null;
    Searcher searcher = null;
    try {
        reader = IndexReader.open(this.indexDir, true);
        searcher = new IndexSearcher(reader);

        // get terms from query
        HashSet<String> queryTermSet = new HashSet<String>();
        TokenStream stream = analyzer.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(query));
        try {
            TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                queryTermSet.add(termAtt.term());
            }
            stream.end();
        } finally {
            stream.close();
        }

        // construct the query
        BooleanQuery bq = new BooleanQuery();
        SynonymMap synMap = SynonymIndex.getSynonymMap();
        // Tracks every term already in the query so a synonym is added only once.
        HashSet<String> expandedQueryTermSet = new HashSet<String>(queryTermSet);

        for (String s : queryTermSet) {
            bq.add(new TermQuery(new Term(LabelDocument.FIELD_LABEL, s)), Occur.SHOULD);
            // expand using synonyms, stemmed the same way as the indexed terms
            for (String syn : synMap.getSynonyms(s)) {
                stemer.setCurrent(syn);
                stemer.stem();
                String stemmed = stemer.getCurrent();
                if (expandedQueryTermSet.add(stemmed)) {
                    bq.add(new TermQuery(new Term(LabelDocument.FIELD_LABEL, stemmed)), Occur.SHOULD);
                }
            }
        }

        // search in the label index
        SimilarLabelQueryResultCollector collector = new SimilarLabelQueryResultCollector(reader, queryTermSet,
                similarity);
        searcher.search(bq, collector);
        ret = collector.getQueryResult();
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        // Close failures are only reported; they must not mask the result.
        if (searcher != null) {
            try {
                searcher.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        if (reader != null) {
            try {
                reader.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
    return ret;
}

From source file:cn.edu.thss.iise.beehivez.server.index.luceneindex.analyzer.SemicolonAnalyzer.java

License:Open Source License

/**
 * Demo entry point: tokenizes a fixed sample sentence with a
 * {@code SemicolonAnalyzer} and prints each resulting term on its own line.
 *
 * @param args command-line arguments (not used)
 * @throws IOException if the token stream fails while being consumed or closed
 */
public static void main(String[] args) throws IOException {
    // text to tokenize
    final String text = "This is a demo of , the new TokenStream API";

    SemicolonAnalyzer analyzer = new SemicolonAnalyzer();
    TokenStream stream = analyzer.tokenStream("field", new StringReader(text));
    try {
        // get the TermAttribute from the TokenStream
        TermAttribute termAtt = stream.addAttribute(TermAttribute.class);

        stream.reset();

        // print all tokens until stream is exhausted
        while (stream.incrementToken()) {
            System.out.println(termAtt.term());
        }

        stream.end();
    } finally {
        // Release the stream even when tokenizing throws.
        stream.close();
    }
}

From source file:cn.edu.thss.iise.beehivez.server.util.StringSimilarityUtil.java

License:Open Source License

/**
 * Tokenizes the given string with an English {@code SnowballAnalyzer} configured
 * with {@code StandardAnalyzer.STOP_WORDS_SET}: all the words are extracted,
 * lowercased, all the stop words are removed, and all the words are replaced
 * with their stem.
 *
 * <p>The token stream is closed even when tokenizing fails.
 *
 * @param label the text to tokenize; {@code null} yields an empty set
 * @return the distinct terms produced by the analyzer; whatever was collected so
 *         far (possibly nothing) when an exception occurs (the exception is printed)
 */
public static HashSet<String> snowballTokenize(String label) {
    HashSet<String> ret = new HashSet<String>();
    if (label == null) {
        // Same result as before, without going through a caught NullPointerException.
        return ret;
    }
    try {
        Analyzer analyzer = new SnowballAnalyzer(Version.LUCENE_CURRENT, "English",
                StandardAnalyzer.STOP_WORDS_SET);

        TokenStream stream = analyzer.tokenStream(LabelDocument.FIELD_LABEL, new StringReader(label));
        try {
            TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                ret.add(termAtt.term());
            }
            stream.end();
        } finally {
            stream.close();
        }
    } catch (Exception e) {
        e.printStackTrace();
    }

    return ret;
}

From source file:com.antsdb.saltedfish.sql.vdm.LuceneUtil.java

License:Open Source License

/**
 * Tokenizes {@code text} with a {@code StandardAnalyzer} and passes every token
 * to {@code lambda}.
 *
 * <p>The token stream follows the Lucene workflow (reset, incrementToken, end,
 * close); both the stream and the analyzer are closed by the
 * try-with-resources statement, the stream first.
 *
 * @param text the text to tokenize
 * @param lambda callback receiving, for each token, its type (first argument)
 *        and its term text (second argument)
 * @throws RuntimeException wrapping the {@code IOException} if tokenizing fails
 */
static void tokenize(String text, BiConsumer<String, String> lambda) {
    try (StandardAnalyzer analyzer = new StandardAnalyzer();
            TokenStream stream = analyzer.tokenStream("", text)) {
        CharTermAttribute term = stream.getAttribute(CharTermAttribute.class);
        TypeAttribute type = stream.getAttribute(TypeAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            lambda.accept(type.type(), term.toString());
        }
        // Required by the TokenStream contract once the last token was consumed.
        stream.end();
    } catch (IOException x) {
        throw new RuntimeException(x);
    }
}

From source file:com.b2international.index.compat.Highlighting.java

License:Apache License

/**
 * Splits a string to a list of tokens using the specified Lucene analyzer.
 * /*  ww  w.ja v a 2s .  c  o m*/
 * @param analyzer the analyzer determining token boundaries (may not be {@code null})
 * @param s the string to split
 * @return a list of tokens, or an empty list if {@code s} is {@code null} or empty
 */
public static List<String> split(Analyzer analyzer, final String s) {

    checkNotNull(analyzer, "analyzer");

    if (Strings.isNullOrEmpty(s)) {
        return ImmutableList.of();
    }

    final List<String> tokens = Lists.newArrayList();
    TokenStream stream = null;

    try {

        stream = analyzer.tokenStream(null, new StringReader(s));
        stream.reset();

        while (stream.incrementToken()) {
            tokens.add(stream.getAttribute(CharTermAttribute.class).toString());
        }

    } catch (final IOException ignored) {
        // Should not be thrown when using a string reader
    } finally {
        endAndCloseQuietly(stream);
    }

    return tokens;
}

From source file:com.basistech.elasticsearch.index.analysis.rosette.SimpleRosetteAnalysisTests.java

License:Open Source License

/**
 * Asserts that {@code stream} produces exactly the terms in {@code expected},
 * in the same order.
 *
 * <p>The stream is reset and consumed here; it is neither ended nor closed by
 * this method.
 *
 * @param stream the token stream to consume
 * @param expected the terms the stream is expected to emit, in order
 * @throws IOException if resetting or advancing the stream fails
 */
public static void assertSimpleTSOutput(TokenStream stream, String[] expected) throws IOException {
    stream.reset();
    final CharTermAttribute termAttr = stream.getAttribute(CharTermAttribute.class);
    Assert.assertNotNull(termAttr);

    int seen = 0;
    for (; stream.incrementToken(); seen++) {
        final String actual = termAttr.toString();
        // Fail with a readable message before indexing past the end of 'expected'.
        Assert.assertTrue(seen < expected.length, "got extra term: " + actual);
        Assert.assertEquals(actual, expected[seen], "expected different term at index " + seen);
    }

    Assert.assertEquals(seen, expected.length, "not all tokens produced");
}