List of usage examples for org.apache.lucene.analysis TokenStream end
Prototype: public void end() throws IOException

This method is called by the consumer after the last token has been consumed, i.e. after incrementToken() returned false (using the new TokenStream API).
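All of the examples below follow the same consumption lifecycle that end() belongs to: reset() before the first token, incrementToken() in a loop, end() once the loop returns false, then close(). The following is a minimal sketch of that pattern (not taken from the listed sources), assuming a recent Lucene release where StandardAnalyzer has a no-arg constructor and using a hypothetical field name "body":

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class EndUsageSketch {

    // Tokenizes the given text and returns the terms, exercising the full
    // reset / incrementToken / end / close contract of TokenStream.
    public static List<String> tokenize(Analyzer analyzer, String text) throws IOException {
        List<String> terms = new ArrayList<>();
        TokenStream ts = analyzer.tokenStream("body", new StringReader(text));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        try {
            ts.reset();                   // required before the first incrementToken()
            while (ts.incrementToken()) { // returns false after the last token
                terms.add(termAtt.toString());
            }
            ts.end();                     // perform end-of-stream operations, e.g. set the final offset
        } finally {
            ts.close();                   // release resources held by the stream
        }
        return terms;
    }

    public static void main(String[] args) throws IOException {
        System.out.println(tokenize(new StandardAnalyzer(), "TokenStream end example"));
    }
}

The entries that follow differ only in what they do with each token (encryption, stop-word filtering and stemming, frequency counting), not in this lifecycle.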
From source file:pt.unlfctdi.cryptosearch.core.client.PrototypeClientConnector.java
License:Apache License
public void addFirstDocuments(File[] docs) {
    try {
        // File f = new File(path);
        // File[] docs = f.listFiles();
        for (int i = 0; i < docs.length; i++) {
            String content = Utils.readFileAsString(docs[i]);
            List<WordKey> cipheredWords = new ArrayList<WordKey>();
            TokenStream ts = analyzer.tokenStream(null, new BufferedReader(new StringReader(content)));
            try {
                ts.reset();
                while (ts.incrementToken()) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    if (word.length() > 0)
                        cipheredWords.add(new WordKey(crypto.encryptWordKey(word)));
                }
                ts.end();
            } finally {
                ts.close();
            }
            search.addFirstDocuments(crypto.encryptAES(docs[i].getName().getBytes()), cipheredWords);
            storage.putDoc("" + i,
                    crypto.encryptAES(Utils.serializeObject(new PDocument(docs[i].getName(), content))));
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
From source file:relevantfile.XmlParser.java
License:Open Source License
/********************************************************************************************/
public static String removeStopWordsAndStem(String input) throws IOException {
    /*String[] stop_word={"abstract","assert","boolean","break","byte","case","catch","char","class","const","continue"
            ,"default","do","double","else","enum","extends","final","finally","float","for","goto","if","implements","import","instanceof","int"
            ,"interface","long","native","new","package","private","protected","public","return","short","static","strictfp","super",
            "switch","synchronized","this","throw","throws","transient","try","void","volatile","while","false","null","true"};*/
    String[] stop_word = { "auto", "break", "case", "char", "const", "continue", "default", "do", "double",
            "else", "enum", "extern", "float", "for", "goto", "if", "int", "long", "register", "return",
            "short", "signed", "sizeof", "static", "struct", "switch", "typedef", "union", "unsigned", "void",
            "volatile", "while", "abstract", "as", "base", "bool", "byte", "catch", "checked", "class",
            "decimal", "delegate", "event", "explicit", "false", "finally", "fixed", "foreach", "implicit",
            "in", "interface", "internal", "is", "lock", "namespace", "new", "null", "object", "operator",
            "out", "override", "params", "private", "protected", "public", "readonly", "ref", "sbyte",
            "sealed", "stackalloc", "string", "this", "throw", "true", "try", "typeof", "uint", "ulong",
            "unchecked", "unsafe", "ushort", "using", "virtual" };
    ArrayList<String> stopWords = new ArrayList<String>();
    for (int k = 0; k < stop_word.length; k++)
        stopWords.add(stop_word[k]);

    TokenStream tokenStream = new StandardTokenizer(Version.LUCENE_46, new StringReader(input));
    tokenStream = new StopFilter(Version.LUCENE_46, tokenStream, StandardAnalyzer.STOP_WORDS_SET);
    tokenStream = new StopFilter(Version.LUCENE_46, tokenStream,
            StopFilter.makeStopSet(Version.LUCENE_46, stopWords));
    tokenStream = new PorterStemFilter(tokenStream);

    StringBuilder sb = new StringBuilder();
    CharTermAttribute token = tokenStream.getAttribute(CharTermAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        if (sb.length() > 0) {
            sb.append(" ");
        }
        sb.append(token.toString());
    }
    tokenStream.end();
    tokenStream.close();
    return sb.toString();
}
From source file:retriever.TermFreq.java
String analyze(String query) throws Exception {
    StringBuffer buff = new StringBuffer();
    TokenStream stream = analyzer.tokenStream("dummy", new StringReader(query));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        term = term.toLowerCase();
        buff.append(term).append(" ");
    }
    stream.end();
    stream.close();
    return buff.toString();
}
From source file:ri.trabri.Lucene.java
protected ArrayList<String> geraTokens(String text) throws IOException {
    TokenStream stream = this.analyzer.tokenStream(null, new StringReader(text));
    ArrayList<String> words = new ArrayList<>();
    CharTermAttribute cattr = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        //System.out.println(cattr.toString());
        words.add(cattr.toString());
    }
    stream.end();
    stream.close();
    return words;
}
From source file:searching.QueryExpansion.java
/**
 * store frequencies of top docs in maps
 *
 * @param text
 * @param doc_score
 * @param analyzer
 * @param reader
 * @throws IOException
 */
public void addExpansionDoc(String text, double doc_score, Analyzer analyzer, IndexReader reader)
        throws IOException {
    if (actual_pdocs < QueryExpansion.pseudo_rel_docs) {
        TreeMap<String, Double> map = new TreeMap();
        Integer length = 0;
        Double f;
        String term;
        TokenStream ts = analyzer.tokenStream("myfield", new StringReader(text));
        //OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {
                    f = map.get(term);
                    if (f == null) {
                        map.put(term, 1.0);
                    } else {
                        map.put(term, f + 1.0);
                    }
                    length++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        pdocs[actual_pdocs] = map;
        pdoc_lengths[actual_pdocs] = length;
        pdoc_scores[actual_pdocs] = doc_score;
        //logger.info(observed_bg_mass[iter] + "\t" + (1-observed_bg_prob));
        actual_pdocs++;
    }
}
From source file:searching.QueryExpansion.java
/**
 * calculate positional relevance weights
 *
 * @param query
 * @param text
 * @param doc_score
 * @param analyzer
 * @param reader
 * @throws IOException
 */
public void addPositionalExpansionDoc(CustomQuery query, String text, double doc_score, Analyzer analyzer,
        IndexReader reader) throws IOException {
    //System.out.println(query);
    //System.out.println(text);

    if (actual_pdocs < QueryExpansion.pseudo_rel_docs) {
        TreeMap<String, ArrayList<Long>> query_term_pos = new TreeMap();
        Integer length = 0;
        Long pos = 1L;
        String term;
        TokenStream ts = analyzer.tokenStream("myfield", new StringReader(text));
        ArrayList<Long> qpos;
        //OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {
                    //System.out.print(pos + ":" + term + " ");
                    if (query.contains(term)) {
                        qpos = query_term_pos.get(term);
                        if (qpos == null) {
                            qpos = new ArrayList<>();
                        }
                        qpos.add(pos);
                        query_term_pos.put(term, qpos);
                    }
                    length++;
                    pos++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        //
        // All positions collected
        // now iterate over the document again to get weights
        //
        //System.out.println("Doc length" + text.length());
        //System.out.println("Positions... ");
        //System.out.println(query_term_pos.toString());
        //System.out.println("END...");

        TreeMap<String, Double> map = new TreeMap();
        Double f;
        pos = 1L;
        double w, w_norm, prob, f0;
        Double pos_length = 0.0;
        Double sum_df = (double) reader.getSumDocFreq("text");
        double spud_pi = SPUDLMSimilarity.b0 * SPUDLMSimilarity.omega
                / (query_term_pos.size() * (1 - SPUDLMSimilarity.omega)
                        + SPUDLMSimilarity.b0 * SPUDLMSimilarity.omega);
        Double df;
        double dist;

        ts = analyzer.tokenStream("myfield", new StringReader(text));
        try {
            ts.reset(); // Resets this stream to the beginning. (Required)
            while (ts.incrementToken()) {
                term = ts.getAttribute(CharTermAttribute.class).toString();
                if (term.length() > 1) {
                    prob = 0.0;
                    //f is occurrence
                    w_norm = Math.sqrt(2 * Math.PI * prm_sigma * prm_sigma);
                    for (String qt : query_term_pos.keySet()) {
                        ArrayList<Long> pos_list = query_term_pos.get(qt);
                        w = 1.0;
                        df = (double) reader.docFreq(new Term("text", qt));
                        for (Long p : pos_list) {
                            dist = ((pos - p) * (pos - p)) / (2 * prm_sigma * prm_sigma);
                            f0 = Math.exp(-dist);
                            //if (QueryExpansion.method == QueryExpansion.PRM2QTM){
                            //    w += (((double) ((1 - spud_pi) * f0) / (((1 - spud_pi) * f0) + spud_pi * (df / sum_df))));
                            //}else{
                            w += f0;
                            //}
                        }
                        //System.out.println("weight " + w);
                        prob += Math.log(w / w_norm);
                    }
                    //System.out.print(pos + "\t" + term + "\t" + Math.exp(prob) + "\n");

                    /** sum of the probabilities over the positional terms in the documents */
                    f = map.get(term);
                    if (f == null) {
                        map.put(term, Math.exp(prob));
                    } else {
                        map.put(term, f + Math.exp(prob));
                    }
                    pos_length += Math.exp(prob);
                    pos++;
                }
            }
            ts.end();
        } finally {
            ts.close();
        }

        double sum = 0.0;
        for (String word : map.keySet()) {
            //logger.info(word + "\t" + map.get(word)/pos_length);
            sum += map.get(word) / pos_length;
        }
        //logger.info("sum is " + sum);

        pdocs[actual_pdocs] = map;
        pdoc_lengths[actual_pdocs] = length;
        pdoc_positional_lengths[actual_pdocs] = pos_length;
        pdoc_scores[actual_pdocs] = doc_score;
        actual_pdocs++;
    }
}
From source file:servlets.TermStatsComparator.java
String analyze(String query) {
    StringBuffer buff = new StringBuffer();
    try {
        Analyzer analyzer = retriever.getAnalyzer();
        TokenStream stream = analyzer.tokenStream("dummy", new StringReader(query));
        CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            String term = termAtt.toString();
            buff.append(term);
            break;
        }
        stream.end();
        stream.close();
    } catch (Exception ex) {
        ex.printStackTrace();
        return query;
    }
    return buff.toString();
}
From source file:stackoverflow.lucene.modified.MoreLikeThis.java
License:Apache License
/**
 * Adds term frequencies found by tokenizing text from reader into the Map words
 *
 * @param r a source of text to be tokenized
 * @param termFreqMap a Map of terms and their frequencies
 * @param fieldName Used by analyzer for any special per-field analysis
 */
private void addTermFrequencies(Reader r, Map<String, Int> termFreqMap, String fieldName) throws IOException {
    if (analyzer == null) {
        throw new UnsupportedOperationException(
                "To use MoreLikeThis without term vectors, you must provide an Analyzer");
    }
    TokenStream ts = analyzer.tokenStream(fieldName, r);
    int tokenCount = 0;
    // for every token
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        String word = termAtt.toString();
        tokenCount++;
        if (tokenCount > maxNumTokensParsed) {
            break;
        }
        if (isNoiseWord(word)) {
            continue;
        }
        // increment frequency
        Int cnt = termFreqMap.get(word);
        if (cnt == null) {
            termFreqMap.put(word, new Int());
        } else {
            cnt.x++;
        }
    }
    ts.end();
    ts.close();
}
From source file:test.AnalzyerDemo.java
License:Apache License
public static void main(String[] args) {
    Analyzer analyzer = new BaseAnalyzer();
    // Analyzer analyzer = new org.apache.lucene.analysis.cjk.CJKAnalyzer();
    // Obtain a Lucene TokenStream from the analyzer
    TokenStream ts = null;
    try {
        ts = analyzer.tokenStream("myfield", new StringReader(
                "????????????????2?3noneok???BaseAnalyer can analysis english text too"));
        // Offset information for each token
        OffsetAttribute offset = ts.addAttribute(OffsetAttribute.class);
        // Token text
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        // Token type
        TypeAttribute type = ts.addAttribute(TypeAttribute.class);
        // Reset the TokenStream to the beginning of the StringReader
        ts.reset();
        // Iterate over all tokens
        while (ts.incrementToken()) {
            System.out.println(offset.startOffset() + " - " + offset.endOffset() + " : " + term.toString()
                    + " | " + type.type());
        }
        // Signal that the TokenStream / StringReader is exhausted
        ts.end(); // Perform end-of-stream operations, e.g. set the final offset.
    } catch (IOException e) {
        e.printStackTrace();
        analyzer.close();
    } finally {
        // Always close the TokenStream
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
From source file:tweetembeding.AnalyzerClass.java
public String analizeString(String FIELD, String txt) throws IOException {
    this.analyzer = setAnalyzer();
    TokenStream stream = analyzer.tokenStream(FIELD, new StringReader(txt));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    StringBuffer tokenizedContentBuff = new StringBuffer();
    while (stream.incrementToken()) {
        String term = termAtt.toString();
        if (!term.equals("nbsp"))
            tokenizedContentBuff.append(term).append(" ");
    }
    stream.end();
    stream.close();
    return tokenizedContentBuff.toString();
}
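One variation none of the sources above use: TokenStream implements Closeable, so the try/finally bookkeeping around close() can also be written with try-with-resources. A minimal sketch (not from the listed sources), reusing the analyzer, FIELD, txt, and tokenizedContentBuff names of the last example:

try (TokenStream stream = analyzer.tokenStream(FIELD, new StringReader(txt))) {
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        tokenizedContentBuff.append(termAtt.toString()).append(" ");
    }
    stream.end(); // still call end() before the implicit close(): it records end-of-stream state such as the final offset
}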