Example usage for org.apache.lucene.analysis TokenStream reset

List of usage examples for org.apache.lucene.analysis TokenStream reset

Introduction

In this page you can find the example usage for org.apache.lucene.analysis TokenStream reset.

Prototype

public void reset() throws IOException 

Source Link

Document

This method is called by a consumer before it begins consumption via #incrementToken().

Usage

From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.newsFetch.storm.bolts.TweetAnalyzerBolt.java

License: Apache License

@Override
public void execute(Tuple input) {
    try {
        String tweet = (String) input.getValueByField(StreamIDs.TWEET);

        // Pick an analyzer matching the tweet's detected language.
        LanguageIdentifier identifier = new LanguageIdentifier(tweet);
        NewsRecLuceneAnalyzer analyzer = LanguageAnalyzerHelper.getInstance()
                .getAnalyzer(new Locale(identifier.getLanguage()));

        // try-with-resources closes both the reader and the stream even if
        // incrementToken() throws; the original leaked them on that path.
        try (Reader reader = new StringReader(tweet);
                TokenStream tokenStream = analyzer.tokenStream("", reader)) {
            CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

            tokenStream.reset();
            // Emit one tuple per analyzed term.
            while (tokenStream.incrementToken()) {
                String term = charTermAttribute.toString();
                collector.emit(StreamIDs.TERMSTREAM, new Values(term));
            }
            // end() is required by the TokenStream consumption contract
            // before close(); it was missing from the original.
            tokenStream.end();
        }

        // Named entities are emitted on the same stream as plain terms.
        for (String term : extractNames(tweet, analyzer.getStopwords())) {
            collector.emit(StreamIDs.TERMSTREAM, new Values(term));
        }
    } catch (IOException ex) {
        logger.error(ex);
    }
}

From source file:bixo.examples.webmining.PhraseShingleAnalyzer.java

License:Apache License

/**
 * Tokenizes {@code contentText} with the configured analyzer and returns the
 * non-empty terms in order of occurrence.
 *
 * @param contentText text to analyze; must not be null
 * @return list of analyzed terms (possibly empty, never null)
 * @throws RuntimeException wrapping any IOException from the analyzer
 */
public List<String> getTermList(String contentText) {
    // Pre-size with a rough terms-per-character heuristic.
    List<String> result = new ArrayList<String>(contentText.length() / 10);

    // try-with-resources closes the stream even when incrementToken()
    // throws; the original only closed it on the success path.
    try (TokenStream stream = _analyzer.tokenStream("content", new StringReader(contentText))) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

        stream.reset();
        while (stream.incrementToken()) {
            if (termAtt.length() > 0) {
                result.add(termAtt.toString());
            }
        }
        stream.end();
    } catch (IOException e) {
        throw new RuntimeException("Impossible error", e);
    }

    return result;
}

From source file:br.bireme.ngrams.Tools.java

/**
 * Prints every token produced by {@code analyzer} for {@code text}, one per
 * line, together with its start/end character offsets, to standard output.
 *
 * @param analyzer  analyzer used to tokenize the text
 * @param fieldName field name handed to the analyzer (may influence tokenization)
 * @param text      text to tokenize
 * @throws IOException if tokenization fails
 */
public static void showTokens(final Analyzer analyzer, final String fieldName, final String text)
        throws IOException {
    // try-with-resources supplies the close() the original omitted; end()
    // is also required by the TokenStream workflow contract.
    try (TokenStream tokenStream = analyzer.tokenStream(fieldName, text)) {
        OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            int startOffset = offsetAttribute.startOffset();
            int endOffset = offsetAttribute.endOffset();
            final String term = charTermAttribute.toString();

            System.out.println(term + " [" + startOffset + "," + endOffset + "]");
        }
        tokenStream.end();
    }
}

From source file:br.edu.utfpr.cm.JGitMinerWeb.services.matrix.auxiliary.LuceneUtil.java

/**
 * Splits {@code linha} into lowercased tokens using Lucene's StopAnalyzer
 * (default English stop words removed).
 *
 * @param linha input text to tokenize
 * @return the tokens collected so far (partial if an I/O error occurred)
 */
public static List<String> tokenizeString(String linha) {

    List<String> result = new ArrayList<>();

    // The original leaked both the analyzer and the stream and never called
    // end(); try-with-resources releases them on every path.
    try (Analyzer analyzer = new StopAnalyzer(Version.LUCENE_46);
            TokenStream stream = analyzer.tokenStream(null, new StringReader(linha))) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            result.add(termAtt.toString());
        }
        stream.end();
    } catch (IOException e) {
        // Best-effort, matching the original: report the failure and return
        // whatever was tokenized before it.
        System.out.println(e.getMessage());
    }

    return result;
}

From source file:br.edu.utfpr.cm.JGitMinerWeb.util.LuceneUtil.java

/**
 * Splits {@code linha} into lowercased tokens using Lucene's StopAnalyzer
 * (default English stop words removed).
 *
 * @param linha input text to tokenize
 * @return the tokens collected so far (partial if an I/O error occurred)
 */
public static List<String> tokenizeString(String linha) {

    List<String> result = new ArrayList<>();

    // The original leaked both the analyzer and the stream and never called
    // end(); try-with-resources releases them on every path.
    try (Analyzer analyzer = new StopAnalyzer();
            TokenStream stream = analyzer.tokenStream(null, new StringReader(linha))) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            result.add(termAtt.toString());
        }
        stream.end();
    } catch (IOException e) {
        // Best-effort, matching the original: report the failure and return
        // whatever was tokenized before it.
        System.out.println(e.getMessage());
    }

    return result;
}

From source file:br.ufmt.harmonizacao.implementer.PatenteeSearcher.java

/**
 * Tokenizes {@code value} with the analyzer and searches {@code field} with a
 * boolean query assembled from the tokens: a prefix query on the first token,
 * a fuzzy query on the second (and a MUST_NOT length constraint), or fuzzy /
 * exact term queries when the input yields a single token. Returns a list
 * containing the normalized token string, up to 10 matching field values, and
 * two timing strings; returns null if an IOException occurs.
 *
 * NOTE(review): the stream leaks if an exception is thrown between reset()
 * and close(); acronymBq is re-created on every loop iteration so only the
 * last iteration's clauses survive — confirm this is intended; the local
 * 'query' is never used.
 */
public List<String> search(String field, String value) {
    try {
        long start = System.currentTimeMillis();
        TokenStream stream = analyzer.tokenStream(field, new StringReader(value));
        CharTermAttribute attr = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        String valor = "";
        // Rebuild the input as a space-separated string of analyzed tokens.
        while (stream.incrementToken()) {
            valor = valor + attr.toString() + ' ';
        }
        BooleanQuery bq = new BooleanQuery();
        BooleanQuery acronymBq = null;
        String query = "";
        BooleanQuery wrapBq = new BooleanQuery();
        String[] tokens = valor.split(" ");
        for (int i = 0; i < tokens.length; i++) {
            if (tokens.length >= 2) {
                acronymBq = new BooleanQuery();
                switch (i) {
                case 0:
                    // First token: prefix match.
                    acronymBq.add(new PrefixQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                    bq.add(new PrefixQuery(new Term(field, tokens[i])), BooleanClause.Occur.SHOULD);
                    break;
                case 1:
                    // Second token: approximate (fuzzy) match.
                    acronymBq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST_NOT);
                    bq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.SHOULD);
                    bq.add(new LengthQuery(field, valor), BooleanClause.Occur.MUST_NOT);
                    break;
                default:
                    break;
                }
            } else {
                // Single-token input: fuzzy match for longer tokens,
                // exact term match for short ones.
                if (tokens[i].length() > 3) {
                    bq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                } else {
                    bq.add(new TermQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                }
            }
        }

        stream.end();
        stream.close();
        // Tokenization ends here.
        // The fuzzy query performs the approximate matching.

        wrapBq.add(bq, BooleanClause.Occur.MUST);
        if (acronymBq != null) {
            //new QueryParser(Version.LUCENE_47, field, new StandardAnalyzer(Version.LUCENE_47)).parse(query)
            wrapBq.add(acronymBq, BooleanClause.Occur.MUST_NOT);
        }
        String queryTime = "Tempo para construo da query : " + (System.currentTimeMillis() - start) + "ms";
        // Collect the documents matched by the search (top 10).
        start = System.currentTimeMillis();
        ScoreDoc[] hits = searcher.search(wrapBq, 10).scoreDocs;
        String searchTime = "Tempo para busca : " + (System.currentTimeMillis() - start) + "ms";
        List<String> result = new ArrayList<String>();
        result.add(valor);
        if (hits.length > 0) {
            for (int i = 0; i < hits.length; i++) {
                Document hitDoc = searcher.doc(hits[i].doc);
                result.add(hitDoc.get(field));
            }
        }
        result.add(queryTime);
        result.add(searchTime);
        return result;
    } catch (IOException ex) {
        Logger.getLogger(PatenteeSearcher.class.getName()).log(Level.SEVERE, null, ex);
    }
    // NOTE(review): callers must handle a null return on I/O failure.
    return null;
}

From source file:ca.ualberta.entitylinking.common.indexing.TFIDF3x.java

License:Open Source License

/**
 * Filter the string with StandardAnalyzer.
 * @param str/*from   w ww.  j  a v a 2  s  .  c  om*/
 * @param removeStopWords   Indicate if the stop words should be removed.
 * @return
 */
public static String processString(String str, boolean removeStopWords) {
    StringBuffer strBuf = new StringBuffer();

    try {
        Analyzer analyzer = null;
        if (removeStopWords)
            analyzer = new StandardAnalyzer(Version.LUCENE_34);
        else
            analyzer = new TextAnalyzerWithStopwords(Version.LUCENE_34);

        TokenStream tokenStream = analyzer.tokenStream("string", new StringReader(str));
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();
            strBuf.append(term + " ");
        }

        analyzer.close();
    } catch (Exception e) {
        e.printStackTrace();
    }

    return strBuf.toString().trim();
}

From source file:ca.ualberta.entitylinking.common.indexing.TFIDF3x.java

License:Open Source License

/**
 * Builds a tf-idf vector for {@code text} out of an already-computed document
 * vector: the text is tokenized and each term found in {@code docVector} is
 * copied into the result with its existing weight. This avoids recomputing
 * tf-idf for terms of the same document.
 *
 * @param text      text whose term vector is wanted
 * @param docVector precomputed term-to-weight map of the containing document
 * @return map from term to weight for the terms of {@code text} present in
 *         {@code docVector}
 */
public Map<String, Float> TextTFIDFVector(String text, Map<String, Float> docVector) {
    Map<String, Float> map = new HashMap<String, Float>();

    // Preprocess the text with the same StandardAnalyzer configuration used
    // for the document, so terms line up with docVector's keys.
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_34);
    // The original never closed the token stream nor called end();
    // try-with-resources fixes the leak.
    try (TokenStream tokenStream = analyzer.tokenStream("string", new StringReader(text))) {
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();

            if (docVector.containsKey(term))
                map.put(term, docVector.get(term));
        }
        tokenStream.end();
    } catch (Exception e) {
        e.printStackTrace();
    } finally {
        analyzer.close();
    }

    return map;
}

From source file:cl.usach.escalemania.sessionbeans.DocumentoFacade.java

/**
 * Tokenizes {@code tweet} with the supplied analyzer and returns the terms in
 * order of occurrence.
 *
 * @param analyzer analyzer to use (not closed here; owned by the caller)
 * @param tweet    text to tokenize
 * @return list of analyzed terms (possibly empty, never null)
 * @throws RuntimeException wrapping any IOException from tokenization
 */
public List<String> tokenizeString(Analyzer analyzer, String tweet) {

    List<String> result = new ArrayList<String>();
    // try-with-resources closes the stream even when incrementToken()
    // throws; end() was missing entirely in the original.
    try (TokenStream stream = analyzer.tokenStream(null, new StringReader(tweet))) {
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            result.add(termAtt.toString());
        }
        stream.end();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}

From source file:cn.com.szgao.enterprise.ExecutorsIndustry.java

/**
 * Reads the given file line by line (each line a JSON-encoded EnterpriseVO),
 * resolves an industry for each record from its business-scope text — first
 * by direct lookup per delimiter-separated segment, then by IK word
 * segmentation as a fallback — and writes the enriched records as JSON to a
 * derived output path. Lines before {@code startNum} and lines larger than
 * 1 MB are skipped.
 *
 * NOTE(review): 'reader', 'fw' and the per-segment TokenStream 'ts' are
 * never closed; basicSum/regSum are never incremented; several locals
 * (sb, batchNum, ss, doString, obj, holderArray, holderDetArray, beforeList,
 * afterList, enterVOT) appear vestigial — confirm before removing.
 */
@SuppressWarnings("rawtypes")
 private void readFileByLines(File file, int startNum) throws Exception {

     // Count of "basic" records; never incremented in this method.
     int basicSum = 0;
     // Count of "registered" records; never incremented in this method.
     int regSum = 0;

     // Batch identifier taken from the input file's name (unused below).
     String batchNum = file.getName();
     // Scratch buffer left over from the commented-out whole-file read.
     StringBuffer sb = new StringBuffer();
     // FileReader fr = new FileReader(file);
     // int ch = 0;
     // while((ch = fr.read())!=-1 )
     // {
     // sb.append((char)ch);
     // }
     // fr.close();
     // fr = null;

     // BufferedWriter fw = null;
     String encoding_from = "UTF-8";// input encoding; GB18030 was used previously
     // String encoding_to = "UTF-8";
     BufferedReader reader = null;
     try {
         // Open the input file with the configured encoding.

         // InputStreamReader isr = new InputStreamReader(new
         // FileInputStream(file), "UTF-8");
         // InputStreamReader isr = new InputStreamReader(new
         // FileInputStream(file), "GBK");
         InputStreamReader isr = new InputStreamReader(new FileInputStream(file), encoding_from);
         reader = new BufferedReader(isr);

     } catch (FileNotFoundException e1) {
         e1.printStackTrace();
     }

     String tempT = null;

     int readNum = 0;

     // Example input path: E:\data\--0008.txt
     String ss = file.getPath().substring(file.getPath().indexOf("data") + 4);
     String ss2 = file.getPath().substring(file.getPath().indexOf("data") + 5, file.getPath().lastIndexOf("\\"));

     //      String folderPath = "D:/lm/log/?/" + ss2;
     String filePath = file.getPath().replace("???", "???industryId")
             .replace("E:", "D:");

     // Create (or replace) the output file at the derived path.
     //      FileUtils.newFolder(folderPath);
     File fileS = new File(filePath);
     String encoding_from1 = "UTF-8";
     BufferedWriter fw = null;
     try {
         if (!fileS.exists()) {
             try {
                 fileS.createNewFile();
             } catch (IOException e) {
                 e.printStackTrace();
                 log.error(e);
             }
             fw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileS, true), encoding_from1)); // append mode, explicit UTF-8
         } else {
             fileS.delete();
             fileS = new File(filePath);
             fw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileS, true), encoding_from1)); // append mode, explicit UTF-8
         }
     } catch (FileNotFoundException e1) {
         e1.printStackTrace();
     }

     int M = 1;// maximum accepted line size, in megabytes
     while ((tempT = reader.readLine()) != null) {

         // tempT="";

         long size = tempT.getBytes().length;
         if (size > M * 1024 * 1024) {
             log.error("--------?1M :" + tempT.substring(0, 500));
             continue;
         }

         /**
          * Shareholder array; never assigned below — NOTE(review): vestigial.
          */
         JSONArray holderArray = null;
         /**
          * Shareholder-detail array; never assigned below — NOTE(review): vestigial.
          */
         JSONArray holderDetArray = null;
         List<String> beforeList = new ArrayList<String>();
         List<String> afterList = new ArrayList<String>();

         EnterpriseVO enterVO = new EnterpriseVO();
         EnterpriseVO enterVOT = new EnterpriseVO();

         readNum++;

         // if(readNum<34431){
         // continue;
         // }
         // if(readNum==34431){
         //
         // writerString(fwUn, tempT);
         // break;
         // }

         // Skip lines before the requested starting position (resume support).
         if (readNum < startNum) {
             continue;
         }
         System.out.println("-->>>>>>> " + (readNum) + "---??" + Thread.currentThread().getName() + "---"
                 + file.getPath());
         // NOTE(review): tempT == "" compares references, not content;
         // tempT.isEmpty() was almost certainly intended.
         if (tempT == null || tempT == "") {
             continue;
         }

         String doString = tempT.toString();
         JSONObject obj = null;
         // System.out.println("in..." + doString);
         try {
             enterVO = gs.fromJson(tempT, EnterpriseVO.class);
         } catch (Exception e) {
             e.printStackTrace();
             log.error(e);
             continue;
             // return;
         }

         String industry = null;
         // IK analyzer used to segment the business-scope text.
         Analyzer anal = new IKAnalyzer();
         String indu = removeBlank(enterVO.getScope());
         IndustryVO ivo = null;
         IndustryVO ivo1 = null;
         if (StringUtils.isNull(industry)) {
             if (!StringUtils.isNull(indu)) {
                 String[] sourceStrArray = indu.split("[;.:,]");// split scope on common delimiters
                 for (String str : sourceStrArray) {
                     // System.out.println("-- " + str);

                     // Direct lookup of the whole segment first.
                     ivo = getIndustry(str);
                     if (!StringUtils.isNull(ivo.getIndustry_id())) {
                         break;
                     }

                     // Fall back to IK word segmentation when the whole
                     // segment did not match an industry directly.
                     if (null == ivo.getIndustry_id()) {

                         StringReader reader1 = new StringReader(str);
                         // Token stream over the segment text.
                         TokenStream ts = null;
                         ts = anal.tokenStream("", reader1);
                         CharTermAttribute term = ts.getAttribute(CharTermAttribute.class);
                         try {
                             ts.reset();

                             String strs = "";
                             // Join the produced terms with '|' separators.
                             while (ts.incrementToken()) {
                                 String temp = term.toString();
                                 if (!StringUtils.isNull(temp)) {
                                     // getIndustry(temp);
                                     strs += term.toString() + "|";
                                     // System.out.print(term.toString() + "|" );
                                 }
                             }
                             reader1.close();

                             String[] arrStr1 = strs.split("\\|");
                             StringUtils.sortStringArray(arrStr1, 1);// sort the terms — presumably by length; confirm
                             List<IndustryVO> listiv = new ArrayList<IndustryVO>();
                             List<IndustryVO> listiv_v = new ArrayList<IndustryVO>();
                             List<IndustryVO> listiv_n = new ArrayList<IndustryVO>();

                             // Look up each segmented term as a candidate industry.
                             for (int i = 0; i < arrStr1.length; i++) {
                                 String temp = arrStr1[i];
                                 if (!StringUtils.isNull(temp)) {
                                     ivo1 = getIndustry(temp);
                                     if (!StringUtils.isNull(ivo1.getIndustry_id())) {
                                         listiv.add(ivo1);
                                         // break;
                                     }
                                 }
                             }
                             // Split candidates by flag: "V" entries take
                             // priority over the rest when choosing a match.
                             if (listiv.size() > 0) {
                                 for (IndustryVO industryVO : listiv) {
                                     if ("V".equals(industryVO.getFlag())) {
                                         listiv_v.add(industryVO);
                                     } else {
                                         listiv_n.add(industryVO);
                                     }
                                 }
                             }
                             if (listiv_v.size() > 0) {
                                 ivo = getfirstSortStringArray(listiv_v, 1);
                                 break;
                             }
                             if (listiv_n.size() > 0) {
                                 ivo = getfirstSortStringArray(listiv_n, 1);
                                 break;
                             }

                         } catch (IOException e) {
                             e.printStackTrace();
                         }
                     }

                     if (!StringUtils.isNull(ivo.getIndustry_id())) {
                         break;
                     }
                 }
             }
         }
         // Attach the resolved industry (if any) to the record.
         if (null != ivo) {
             enterVO.setIndustry(ivo.getIndustry_name());
             enterVO.setIndustryId(ivo.getIndustry_id());
         }
         // System.out.println("in..." + obj);

         writerString(fw, StringUtils.GSON.toJson(enterVO));
     }
     log.info("?regSum: " + regSum + " ?basicSum: " + basicSum + " readNum: "
             + readNum + " -: " + (readNum - basicSum - regSum) + "---??" + file.getPath());
 }