List of usage examples for org.apache.lucene.analysis TokenStream reset
public void reset() throws IOException
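All of the examples below follow the same TokenStream contract: obtain the stream from an Analyzer, add or get the attributes to read, call reset() before the first incrementToken(), loop until incrementToken() returns false, then call end() and close(). In recent Lucene versions, forgetting reset() makes the tokenizer throw an IllegalStateException ("TokenStream contract violation"). A minimal self-contained sketch of that workflow (assuming a recent Lucene release with the no-arg StandardAnalyzer constructor; the field name "body" and the sample text are arbitrary):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ResetExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // tokenStream() returns an unreset stream; reset() is mandatory
        // before the first call to incrementToken().
        try (TokenStream stream = analyzer.tokenStream("body", new StringReader("Hello token streams"))) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term.toString());
            }
            stream.end();   // lets filters record end-of-stream state (e.g. final offset)
        }                   // try-with-resources closes the stream
        analyzer.close();
    }
}

Calling end() before close() matters when offsets are consumed downstream; TokenStream implements Closeable, so try-with-resources handles close() even on exceptions.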
From source file:be.ugent.tiwi.sleroux.newsrec.newsreclib.newsFetch.storm.bolts.TweetAnalyzerBolt.java
License:Apache License
@Override
public void execute(Tuple input) {
    try {
        String tweet = (String) input.getValueByField(StreamIDs.TWEET);
        Reader reader = new StringReader(tweet);
        LanguageIdentifier identifier = new LanguageIdentifier(tweet);
        NewsRecLuceneAnalyzer analyzer = LanguageAnalyzerHelper.getInstance()
                .getAnalyzer(new Locale(identifier.getLanguage()));
        TokenStream tokenStream = analyzer.tokenStream("", reader);
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();
            collector.emit(StreamIDs.TERMSTREAM, new Values(term));
        }
        tokenStream.end();   // finish the stream before closing it
        tokenStream.close();
        reader.close();
        for (String term : extractNames(tweet, analyzer.getStopwords())) {
            collector.emit(StreamIDs.TERMSTREAM, new Values(term));
        }
    } catch (IOException ex) {
        logger.error(ex);
    }
}
From source file:bixo.examples.webmining.PhraseShingleAnalyzer.java
License:Apache License
public List<String> getTermList(String contentText) {
    List<String> result = new ArrayList<String>(contentText.length() / 10);
    try {
        TokenStream stream = _analyzer.tokenStream("content", new StringReader(contentText));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            if (termAtt.length() > 0) {
                String term = termAtt.toString();
                result.add(term);
            }
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException("Impossible error", e);
    }
    return result;
}
From source file:br.bireme.ngrams.Tools.java
public static void showTokens(final Analyzer analyzer, final String fieldName, final String text)
        throws IOException {
    TokenStream tokenStream = analyzer.tokenStream(fieldName, text);
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        final String term = charTermAttribute.toString();
        System.out.println(term + " [" + startOffset + "," + endOffset + "]");
    }
    tokenStream.end();   // finish and release the stream
    tokenStream.close();
}
From source file:br.edu.utfpr.cm.JGitMinerWeb.services.matrix.auxiliary.LuceneUtil.java
public static List<String> tokenizeString(String linha) {
    Analyzer analyzer = new StopAnalyzer(Version.LUCENE_46);
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(linha));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        System.out.println(e.getMessage());
    }
    return result;
}
From source file:br.edu.utfpr.cm.JGitMinerWeb.util.LuceneUtil.java
public static List<String> tokenizeString(String linha) {
    Analyzer analyzer = new StopAnalyzer();
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(linha));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        System.out.println(e.getMessage());
    }
    return result;
}
From source file:br.ufmt.harmonizacao.implementer.PatenteeSearcher.java
public List<String> search(String field, String value) {
    try {
        long start = System.currentTimeMillis();
        TokenStream stream = analyzer.tokenStream(field, new StringReader(value));
        CharTermAttribute attr = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        String valor = "";
        while (stream.incrementToken()) {
            valor = valor + attr.toString() + ' ';
        }
        BooleanQuery bq = new BooleanQuery();
        BooleanQuery acronymBq = null;
        BooleanQuery wrapBq = new BooleanQuery();
        String[] tokens = valor.split(" ");
        for (int i = 0; i < tokens.length; i++) {
            if (tokens.length >= 2) {
                acronymBq = new BooleanQuery();
                switch (i) {
                case 0:
                    acronymBq.add(new PrefixQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                    bq.add(new PrefixQuery(new Term(field, tokens[i])), BooleanClause.Occur.SHOULD);
                    break;
                case 1:
                    acronymBq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST_NOT);
                    bq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.SHOULD);
                    bq.add(new LengthQuery(field, valor), BooleanClause.Occur.MUST_NOT);
                    break;
                default:
                    break;
                }
            } else {
                if (tokens[i].length() > 3) {
                    bq.add(new FuzzyQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                } else {
                    bq.add(new TermQuery(new Term(field, tokens[i])), BooleanClause.Occur.MUST);
                }
            }
        }
        stream.end();
        stream.close();
        // Tokenization ends here. The fuzzy query performs the approximate matching.
        wrapBq.add(bq, BooleanClause.Occur.MUST);
        if (acronymBq != null) {
            wrapBq.add(acronymBq, BooleanClause.Occur.MUST_NOT);
        }
        String queryTime = "Query build time: " + (System.currentTimeMillis() - start) + "ms";
        // Fetch the documents matched by the search.
        start = System.currentTimeMillis();
        ScoreDoc[] hits = searcher.search(wrapBq, 10).scoreDocs;
        String searchTime = "Search time: " + (System.currentTimeMillis() - start) + "ms";
        List<String> result = new ArrayList<String>();
        result.add(valor);
        if (hits.length > 0) {
            for (int i = 0; i < hits.length; i++) {
                Document hitDoc = searcher.doc(hits[i].doc);
                result.add(hitDoc.get(field));
            }
        }
        result.add(queryTime);
        result.add(searchTime);
        return result;
    } catch (IOException ex) {
        Logger.getLogger(PatenteeSearcher.class.getName()).log(Level.SEVERE, null, ex);
    }
    return null;
}
From source file:ca.ualberta.entitylinking.common.indexing.TFIDF3x.java
License:Open Source License
/**
 * Filter the string with StandardAnalyzer.
 * @param str
 * @param removeStopWords indicates whether stop words should be removed.
 * @return the filtered string
 */
public static String processString(String str, boolean removeStopWords) {
    StringBuffer strBuf = new StringBuffer();
    try {
        Analyzer analyzer = null;
        if (removeStopWords)
            analyzer = new StandardAnalyzer(Version.LUCENE_34);
        else
            analyzer = new TextAnalyzerWithStopwords(Version.LUCENE_34);
        TokenStream tokenStream = analyzer.tokenStream("string", new StringReader(str));
        CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();
            strBuf.append(term + " ");
        }
        tokenStream.end();
        tokenStream.close();
        analyzer.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return strBuf.toString().trim();
}
From source file:ca.ualberta.entitylinking.common.indexing.TFIDF3x.java
License:Open Source License
/**
 * This function assumes that the TF-IDF vector of the document containing the text is already
 * given. We simply build a TF-IDF vector of the text out of the docVector.
 * The purpose of doing this is to save the time of computing the tf-idf value for words in
 * the same document.
 *
 * @param text
 * @param docVector
 * @return
 */
public Map<String, Float> TextTFIDFVector(String text, Map<String, Float> docVector) {
    Map<String, Float> map = new HashMap<String, Float>();
    // Preprocess the text using StandardAnalyzer (StandardAnalyzer2 + StopAnalyzer).
    StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_34);
    TokenStream tokenStream = analyzer.tokenStream("string", new StringReader(text));
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            String term = charTermAttribute.toString();
            if (docVector.containsKey(term))
                map.put(term, docVector.get(term));
        }
        tokenStream.end();
        tokenStream.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    analyzer.close();
    return map;
}
From source file:cl.usach.escalemania.sessionbeans.DocumentoFacade.java
public List<String> tokenizeString(Analyzer analyzer, String tweet) {
    List<String> result = new ArrayList<String>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(tweet));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}
From source file:cn.com.szgao.enterprise.ExecutorsIndustry.java
@SuppressWarnings("rawtypes") private void readFileByLines(File file, int startNum) throws Exception { // ??/*from w ww . j ava 2 s.co m*/ int basicSum = 0; // ?? int regSum = 0; // ??? String batchNum = file.getName(); // StringBuffer sb = new StringBuffer(); // FileReader fr = new FileReader(file); // int ch = 0; // while((ch = fr.read())!=-1 ) // { // sb.append((char)ch); // } // fr.close(); // fr = null; // BufferedWriter fw = null; String encoding_from = "UTF-8";// GB18030 // String encoding_to = "UTF-8"; BufferedReader reader = null; try { // ? GB18030 // InputStreamReader isr = new InputStreamReader(new // FileInputStream(file), "UTF-8"); // InputStreamReader isr = new InputStreamReader(new // FileInputStream(file), "GBK"); InputStreamReader isr = new InputStreamReader(new FileInputStream(file), encoding_from); reader = new BufferedReader(isr); } catch (FileNotFoundException e1) { e1.printStackTrace(); } String tempT = null; int readNum = 0; // E:\\data\\--0008.txt String ss = file.getPath().substring(file.getPath().indexOf("data") + 4); String ss2 = file.getPath().substring(file.getPath().indexOf("data") + 5, file.getPath().lastIndexOf("\\")); // String folderPath = "D:/lm/log/?/" + ss2; String filePath = file.getPath().replace("???", "???industryId") .replace("E:", "D:"); // // FileUtils.newFolder(folderPath); File fileS = new File(filePath); String encoding_from1 = "UTF-8"; BufferedWriter fw = null; try { if (!fileS.exists()) { try { fileS.createNewFile(); } catch (IOException e) { e.printStackTrace(); log.error(e); } fw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileS, true), encoding_from1)); // ???? } else { fileS.delete(); fileS = new File(filePath); fw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileS, true), encoding_from1)); // ???? } } catch (FileNotFoundException e1) { e1.printStackTrace(); } int M = 1;// 1M? while ((tempT = reader.readLine()) != null) { // tempT=""; long size = tempT.getBytes().length; if (size > M * 1024 * 1024) { log.error("--------?1M :" + tempT.substring(0, 500)); continue; } /** * */ JSONArray holderArray = null; /** * */ JSONArray holderDetArray = null; List<String> beforeList = new ArrayList<String>(); List<String> afterList = new ArrayList<String>(); EnterpriseVO enterVO = new EnterpriseVO(); EnterpriseVO enterVOT = new EnterpriseVO(); readNum++; // if(readNum<34431){ // continue; // } // if(readNum==34431){ // // writerString(fwUn, tempT); // break; // } if (readNum < startNum) { continue; } System.out.println("-->>>>>>> " + (readNum) + "---??" + Thread.currentThread().getName() + "---" + file.getPath()); if (tempT == null || tempT == "") { continue; } String doString = tempT.toString(); JSONObject obj = null; // System.out.println("in..." + doString); try { enterVO = gs.fromJson(tempT, EnterpriseVO.class); } catch (Exception e) { e.printStackTrace(); log.error(e); continue; // return; } String industry = null; // ? Analyzer anal = new IKAnalyzer(); String indu = removeBlank(enterVO.getScope()); IndustryVO ivo = null; IndustryVO ivo1 = null; if (StringUtils.isNull(industry)) { if (!StringUtils.isNull(indu)) { String[] sourceStrArray = indu.split("[;.:,]");// ? for (String str : sourceStrArray) { // System.out.println("-- " + str); ivo = getIndustry(str); if (!StringUtils.isNull(ivo.getIndustry_id())) { break; } // ik? if (null == ivo.getIndustry_id()) { StringReader reader1 = new StringReader(str); // ? 
TokenStream ts = null; ts = anal.tokenStream("", reader1); CharTermAttribute term = ts.getAttribute(CharTermAttribute.class); try { ts.reset(); String strs = ""; // ???? while (ts.incrementToken()) { String temp = term.toString(); if (!StringUtils.isNull(temp)) { // getIndustry(temp); strs += term.toString() + "|"; // System.out.print(term.toString() + "|" ); } } reader1.close(); String[] arrStr1 = strs.split("\\|"); StringUtils.sortStringArray(arrStr1, 1);// ? List<IndustryVO> listiv = new ArrayList<IndustryVO>(); List<IndustryVO> listiv_v = new ArrayList<IndustryVO>(); List<IndustryVO> listiv_n = new ArrayList<IndustryVO>(); for (int i = 0; i < arrStr1.length; i++) { String temp = arrStr1[i]; if (!StringUtils.isNull(temp)) { ivo1 = getIndustry(temp); if (!StringUtils.isNull(ivo1.getIndustry_id())) { listiv.add(ivo1); // break; } } } // ????? if (listiv.size() > 0) { for (IndustryVO industryVO : listiv) { if ("V".equals(industryVO.getFlag())) { listiv_v.add(industryVO); } else { listiv_n.add(industryVO); } } } if (listiv_v.size() > 0) { ivo = getfirstSortStringArray(listiv_v, 1); break; } if (listiv_n.size() > 0) { ivo = getfirstSortStringArray(listiv_n, 1); break; } } catch (IOException e) { e.printStackTrace(); } } if (!StringUtils.isNull(ivo.getIndustry_id())) { break; } } } } if (null != ivo) { enterVO.setIndustry(ivo.getIndustry_name()); enterVO.setIndustryId(ivo.getIndustry_id()); } // System.out.println("in..." + obj); writerString(fw, StringUtils.GSON.toJson(enterVO)); } log.info("?regSum: " + regSum + " ?basicSum: " + basicSum + " readNum: " + readNum + " -: " + (readNum - basicSum - regSum) + "---??" + file.getPath()); }