Usage examples for org.apache.lucene.analysis.TokenStream#incrementToken()
public abstract boolean incrementToken() throws IOException;
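incrementToken() advances the stream to the next token, updating the stream's attributes in place, and returns false once the stream is exhausted. Callers obtain the attributes they need (typically CharTermAttribute), call reset() before the first incrementToken(), and call end() and close() when done. A minimal sketch of that contract, assuming a Lucene 5.x-or-later StandardAnalyzer; the field name "content" and the sample text are placeholders:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class IncrementTokenExample {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        try (TokenStream ts = analyzer.tokenStream("content", "Hello Lucene token streams")) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                       // mandatory before the first incrementToken()
            while (ts.incrementToken()) {     // false once the stream is exhausted
                System.out.println(termAtt.toString());
            }
            ts.end();                         // records end-of-stream state (final offset)
        }                                     // try-with-resources calls close()
        analyzer.close();
    }
}

The examples below show the same loop in real code bases, against both the current CharTermAttribute API and the older TermAttribute API.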
From source file:edu.upenn.library.solrplugins.CaseInsensitiveSortingTextField.java
License:Apache License
@Override
public BytesRef normalizeQueryTarget(String val, boolean strict, String fieldName, boolean appendExtraDelim)
        throws IOException {
    TokenStream ts = getQueryAnalyzer().tokenStream(fieldName, val);
    try {
        ts.reset();
        CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
        TypeAttribute typeAtt = ts.getAttribute(TypeAttribute.class);
        String matchType = strict ? INDEXED_TOKEN_TYPE : NORMALIZED_TOKEN_TYPE;
        while (ts.incrementToken()) {
            if (matchType.equals(typeAtt.type())) {
                BytesRefBuilder ret = new BytesRefBuilder();
                ret.copyChars(termAtt.toString());
                if (!strict || appendExtraDelim) {
                    ret.append(delimBytes, 0, delimBytes.length);
                }
                return ret.get();
            }
        }
        return new BytesRef(BytesRef.EMPTY_BYTES);
    } finally {
        ts.close();
    }
}
From source file:edu.utsa.sifter.DocMaker.java
License:Apache License
public static boolean addBodyField(final Document doc, final String body, final Analyzer analyzer,
        boolean testEmpty) throws IOException {
    final Field f = new Field("body", body, BodyOptions);
    if (testEmpty) {
        // System.out.println("testing if doc has empty body");
        final TokenStream toks = f.tokenStream(analyzer);
        toks.reset();
        if (!toks.incrementToken()) {
            // System.out.println("empty body, won't index");
            toks.close();
            return false;
        }
    }
    doc.add(new Field("body", body, BodyOptions));
    doc.add(new LongField("body-len", body.length(), Field.Store.YES));
    return true;
}
From source file:edu.virginia.cs.utility.StringTokenizer.java
/**
 * Generates a list of tokens from the parameter string.
 *
 * @param string the string to tokenize
 * @return list of tokens generated
 */
public List<String> TokenizeString(String string) {
    List<String> result = new ArrayList<>();
    try {
        TokenStream stream = analyzer.tokenStream(null, new StringReader(string));
        stream.reset();
        while (stream.incrementToken()) {
            result.add(stream.getAttribute(CharTermAttribute.class).toString());
        }
        stream.end();
        stream.close();
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return result;
}
From source file:elhuyar.bilakit.PayloadQParserPlugin.java
License:Open Source License
@Override
protected Query getFieldQuery(String field, String queryText, boolean quoted) throws SyntaxError {
    SchemaField sf = this.schema.getFieldOrNull(field);
    if (!quoted && sf != null && sf.getType().getTypeName().endsWith("_payloads")) {
        // analyze queryText
        List<String> result = new ArrayList<String>();
        try {
            TokenStream stream = getAnalyzer().tokenStream(field, new StringReader(queryText));
            stream.reset();
            while (stream.incrementToken()) {
                result.add(stream.getAttribute(CharTermAttribute.class).toString());
            }
            stream.end();
            stream.close();
        } catch (IOException e) {
            // not thrown b/c we're using a string reader...
            throw new RuntimeException(e);
        }
        String analyzedqueryText = "";
        analyzedqueryText = result.toString().replaceAll("\\[|\\]", "").replaceAll(", ", " ");
        queryText = analyzedqueryText;
        // Note that this will work for any field defined with the
        // <fieldType> of "*_payloads"
        Query plter = new PayloadTermQuery(new Term(field, queryText), new AveragePayloadFunction(), true);
        return plter;
    }
    return super.getFieldQuery(field, queryText, quoted);
}
From source file:engine.easy.analyzer.EasySearchAnalyzer.java
License:Apache License
private static void printResult(String text, Analyzer analyzer) throws IOException {
    int tokenCount = 0;
    TokenStream tokenStream = analyzer.tokenStream("FIELDNAME", new StringReader(text));
    // this method will be used for token streams
    TermAttribute termAtt = tokenStream.getAttribute(TermAttribute.class);
    while (tokenStream.incrementToken()) {
        tokenCount++;
        String tokenText = new String(termAtt.termBuffer(), 0, termAtt.termLength());
        System.out.println(" >> Token " + tokenCount + ": " + tokenText);
    }
}
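The example above reads term text through TermAttribute.termBuffer()/termLength(), an API from the Lucene 2.9/3.x line that was later replaced by CharTermAttribute and removed in 4.0. A hedged sketch of the same loop against the newer attribute API; the method name printTokens is illustrative and the TokenStream argument is assumed to be freshly obtained from an analyzer:

import java.io.IOException;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TokenPrinter {
    // Prints every token of a freshly obtained TokenStream using the post-3.x attribute API.
    static void printTokens(TokenStream tokenStream) throws IOException {
        CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        int tokenCount = 0;
        while (tokenStream.incrementToken()) {
            tokenCount++;
            // toString() copies the term buffer; buffer()/length() give direct access
            System.out.println(" >> Token " + tokenCount + ": " + termAtt.toString());
        }
        tokenStream.end();
        tokenStream.close();
    }
}

The same substitution applies to the TermAttribute-based examples further down this page.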
From source file:engine.easy.indexer.writer.EasySearchIndexWriter.java
License:Apache License
/**
 * Counts the tokens in the token stream.
 *
 * @return the total number of tokens and the number of distinct tokens.
 * @throws IOException if an I/O error occurs while reading the stream.
 */
private static int[] countTokenStream(TokenStream tokenStream) throws IOException {
    int v[] = new int[2];
    HashSet countTokenStreamBuffer = new HashSet();
    TermAttribute termAtt = tokenStream.getAttribute(TermAttribute.class);
    while (tokenStream.incrementToken()) {
        v[0]++;
        countTokenStreamBuffer.add(new String(termAtt.termBuffer(), 0, termAtt.termLength()));
    }
    v[1] = countTokenStreamBuffer.size();
    tokenStream.reset();
    countTokenStreamBuffer.clear();
    return v;
}
From source file:fr.ericlab.sondy.core.DataManipulation.java
License:Open Source License
public void prepareStream(String datasetName, int intervalDuration, int ngram, String stemLanguage,
        boolean lemmatization, AppVariables appVariables) {
    try {
        Connection connection;
        Class.forName("com.mysql.jdbc.Driver").newInstance();
        connection = DriverManager.getConnection("jdbc:mysql://" + appVariables.configuration.getHost(),
                appVariables.configuration.getUsername(), appVariables.configuration.getPassword());
        Statement statement = connection.createStatement();
        Statement statement2 = connection.createStatement();
        String lemStr = (lemmatization) ? "_lem1" : "_lem0";
        statement.executeUpdate("CREATE TABLE " + appVariables.configuration.getSchema() + "." + datasetName
                + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                + "gram ( id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, msg_author VARCHAR(100), msg_post_time TIMESTAMP, msg_text VARCHAR(600), time_slice INT)ENGINE=myisam;");
        // statement.executeUpdate("CREATE INDEX index_time ON "+appVariables.configuration.getSchema()+"."+datasetName+"_messages (msg_post_time)");
        ResultSet rsTMin = statement.executeQuery("select min(msg_post_time) from "
                + appVariables.configuration.getSchema() + "." + datasetName + "_messages;");
        rsTMin.next();
        Timestamp tMin = rsTMin.getTimestamp(1);
        ResultSet rsTMax = statement.executeQuery("select max(msg_post_time) from "
                + appVariables.configuration.getSchema() + "." + datasetName + "_messages;");
        rsTMax.next();
        Timestamp tMax = rsTMax.getTimestamp(1);
        Timestamp tRef = new Timestamp(0);
        long base = (tMin.getTime() - tRef.getTime()) * 1L;
        long streamDuration = (tMax.getTime() - tMin.getTime()) * 1L;
        long streamDurationMin = (streamDuration / 1000) / 60;
        String path = appVariables.configuration.getWorkspace() + "/datasets/" + datasetName + "/"
                + intervalDuration + "min-" + stemLanguage;
        path += (lemmatization) ? "-lem1" : "-lem0";
        path += "-" + ngram + "gram";
        String pathMention = path + "-m";
        FSDirectory indexGlobal = FSDirectory.open(new File(path));
        FSDirectory indexMention = FSDirectory.open(new File(pathMention));
        Analyzer analyzer;
        Properties props = new Properties();
        props.put("annotators", "tokenize,ssplit,parse,lemma");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation annotation;
        if (stemLanguage.equalsIgnoreCase("Standard")) {
            analyzer = new StandardAnalyzer(Version.LUCENE_36);
        } else {
            Class cl;
            if (stemLanguage.equals("Chinese")) {
                analyzer = new SmartChineseAnalyzer(Version.LUCENE_36);
            } else {
                String packageName = stemLanguage.substring(0, 2).toLowerCase();
                cl = Class.forName(
                        "org.apache.lucene.analysis." + packageName + "." + stemLanguage + "Analyzer");
                Class[] types = new Class[] { Version.class, Set.class };
                Constructor ct = cl.getConstructor(types);
                analyzer = (Analyzer) ct.newInstance(Version.LUCENE_36, appVariables.currentStopWords.getSet());
            }
        }
        IndexWriterConfig configGlobal;
        IndexWriterConfig configMention;
        ShingleAnalyzerWrapper shingleAnalyzer = null;
        if (ngram > 1) {
            shingleAnalyzer = new ShingleAnalyzerWrapper(analyzer, ngram, ngram, " ", false, false);
            WhitespaceAnalyzer whitespaceAnalyzer = new WhitespaceAnalyzer(Version.LUCENE_36);
            configGlobal = new IndexWriterConfig(Version.LUCENE_36, whitespaceAnalyzer);
            configMention = new IndexWriterConfig(Version.LUCENE_36, whitespaceAnalyzer);
        } else {
            configGlobal = new IndexWriterConfig(Version.LUCENE_36, analyzer);
            configMention = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        }
        IndexWriter wGlobal = new IndexWriter(indexGlobal, configGlobal);
        IndexWriter wMention = new IndexWriter(indexMention, configMention);
        int docId = 0;
        for (int i = 0; i < streamDurationMin; i += intervalDuration) {
            statement = connection.createStatement();
            long infBound = base + i * 60 * 1000L;
            long supBound = base + (i + intervalDuration) * 60 * 1000L;
            Timestamp infTime = new Timestamp(infBound);
            Timestamp supTime = new Timestamp(supBound);
            ResultSet rs = statement.executeQuery("SELECT msg_text, msg_post_time, msg_author FROM "
                    + appVariables.configuration.getSchema() + "." + datasetName
                    + "_messages WHERE msg_post_time>'" + infTime + "' AND msg_post_time< '" + supTime + "'");
            String globalContent = new String();
            String mentionContent = new String();
            String timestamps = new String();
            NumberFormat formatter = new DecimalFormat("00000000");
            int bulk = 0;
            String bulkString = "";
            boolean mention;
            while (rs.next()) {
                String message = rs.getString(1).toLowerCase();
                mention = message.contains("@");
                if (lemmatization) {
                    annotation = new Annotation(message);
                    message = "";
                    pipeline.annotate(annotation);
                    List<CoreMap> lem = annotation.get(SentencesAnnotation.class);
                    for (CoreMap l : lem) {
                        for (CoreLabel token : l.get(TokensAnnotation.class)) {
                            message += token.get(LemmaAnnotation.class) + " ";
                        }
                    }
                }
                if (ngram > 1) {
                    String processedMessage = "";
                    TokenStream tokenStream = shingleAnalyzer.tokenStream("text", new StringReader(message));
                    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
                    while (tokenStream.incrementToken()) {
                        String termToken = charTermAttribute.toString();
                        if (!termToken.contains("_")) {
                            processedMessage += termToken.replace(" ", "=") + " ";
                        }
                    }
                    message = processedMessage;
                }
                bulk++;
                if (bulk < _BULK_SIZE_) {
                    bulkString += " (" + docId + ",'" + rs.getString(2) + "',\"" + message + "\",\""
                            + rs.getString(3) + "\"),";
                } else {
                    bulk = 0;
                    bulkString += " (" + docId + ",'" + rs.getString(2) + "',\"" + message + "\",\""
                            + rs.getString(3) + "\");";
                    statement2.executeUpdate("INSERT INTO " + appVariables.configuration.getSchema() + "."
                            + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_"
                            + ngram + "gram (time_slice,msg_post_time,msg_text,msg_author) VALUES" + bulkString);
                    bulkString = "";
                }
                globalContent += message + "\n";
                if (mention) {
                    mentionContent += message + "\n";
                }
                timestamps += rs.getString(2) + "\n";
            }
            if (bulk > 0 && bulkString.length() > 0) {
                statement2.executeUpdate("INSERT INTO " + appVariables.configuration.getSchema() + "."
                        + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                        + "gram (time_slice,msg_post_time,msg_text,msg_author) VALUES"
                        + bulkString.substring(0, bulkString.length() - 1) + ";");
            }
            Document docGlobal = new Document();
            docGlobal.add(new Field("content", globalContent, Field.Store.YES, Field.Index.ANALYZED,
                    Field.TermVector.YES));
            docGlobal.add(new Field("id", Integer.toString(docId), Field.Store.YES, Field.Index.NOT_ANALYZED));
            wGlobal.addDocument(docGlobal);
            wGlobal.commit();
            Document docMention = new Document();
            docMention.add(new Field("content", mentionContent, Field.Store.YES, Field.Index.ANALYZED,
                    Field.TermVector.YES));
            docMention.add(new Field("id", Integer.toString(docId), Field.Store.YES, Field.Index.NOT_ANALYZED));
            wMention.addDocument(docMention);
            wMention.commit();
            File textFile = new File(path + "/input/" + formatter.format(docId) + ".text");
            FileUtils.writeStringToFile(textFile, globalContent);
            File timeFile = new File(path + "/input/" + formatter.format(docId) + ".time");
            FileUtils.writeStringToFile(timeFile, timestamps);
            docId++;
            statement.close();
        }
        statement2.executeUpdate("CREATE INDEX index_time_slice ON " + appVariables.configuration.getSchema()
                + "." + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                + "gram (time_slice);");
        statement2.executeUpdate("CREATE FULLTEXT INDEX index_text ON " + appVariables.configuration.getSchema()
                + "." + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                + "gram (msg_text);");
        statement2.close();
        connection.close();
        wGlobal.close();
        wMention.close();
    } catch (IOException ex) {
        Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex);
    } catch (SQLException | InstantiationException | IllegalAccessException | ClassNotFoundException ex) {
        Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex);
    } catch (NoSuchMethodException | SecurityException | IllegalArgumentException
            | InvocationTargetException ex) {
        Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex);
    }
}
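In the ngram > 1 branch above, each message is re-tokenized through a ShingleAnalyzerWrapper and only shingles that do not contain the "_" filler marker are kept. An isolated sketch of that shingle loop, using the same Lucene 3.6 constructors as the example; the field name "text" and the sample sentence are placeholders:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;

public class ShingleSketch {
    public static void main(String[] args) throws IOException {
        // min and max shingle size of 2, space-separated, no unigrams in the output
        ShingleAnalyzerWrapper shingles = new ShingleAnalyzerWrapper(
                new WhitespaceAnalyzer(Version.LUCENE_36), 2, 2, " ", false, false);
        TokenStream ts = shingles.tokenStream("text", new StringReader("lucene token stream example"));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            String shingle = termAtt.toString();
            if (!shingle.contains("_")) {   // skip shingles padded with the "_" filler token
                System.out.println(shingle);
            }
        }
        ts.end();
        ts.close();
    }
}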
From source file:fr.inrialpes.exmo.ontosim.string.CommonWords.java
License:Open Source License
private void extractTerms(String e) {
    Set<String> s = new LinkedHashSet<String>();
    TokenStream ts = analyzer.tokenStream("", new StringReader(e));
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    try {
        while (ts.incrementToken()) {
            s.add(termAtt.term());
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
    /*
    Token token;
    try {
        while ((token = ts.next()) != null) {
            s.add(token.termText());
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
    */
    map.put(e, s);
}
From source file:fr.inrialpes.exmo.ontosim.string.JWNLDistances.java
License:Open Source License
/**
 * Takes a gloss-like string (text) and returns it tokenized, with:
 * - stop-word removal
 * - lower-casing
 * - Porter stemming
 */
protected Set<String> tokenizeGloss(String s) throws IOException {
    Set<String> result = new HashSet<String>();
    // I am afraid that I am reimplementing the StandardAnalyzer...
    TokenStream ts = new PorterStemFilter(
            new StopFilter(true, new LowerCaseTokenizer(new StringReader(s)), stopWords, true));
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
        result.add(termAtt.term());
    }
    return result;
}
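The filter chain above (letter tokenization with lower-casing, stop-word removal, Porter stemming) is assembled by hand on every call, which is what the source comment is apologising for. One common alternative is to package the chain in a small Analyzer subclass and let tokenStream() rebuild it. A hedged sketch under that assumption; the class name GlossAnalyzer is hypothetical and the package locations assume Lucene 5.x/6.x (they moved again in later releases):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.LowerCaseTokenizer;
import org.apache.lucene.analysis.core.StopFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.util.CharArraySet;

// Hypothetical analyzer packaging the same chain: letters-only lower-cased tokens,
// stop-word removal, then Porter stemming.
class GlossAnalyzer extends Analyzer {
    private final CharArraySet stopWords;

    GlossAnalyzer(CharArraySet stopWords) {
        this.stopWords = stopWords;
    }

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new LowerCaseTokenizer();        // splits on non-letters, lower-cases
        TokenStream chain = new StopFilter(source, stopWords);
        chain = new PorterStemFilter(chain);
        return new TokenStreamComponents(source, chain);
    }
}

tokenizeGloss() would then reduce to the standard reset()/incrementToken()/end()/close() loop over glossAnalyzer.tokenStream("", s).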
From source file:fr.inrialpes.exmo.ontosim.VectorSpaceMeasure.java
License:Open Source License
/**
 * Adds all words contained in toAnalyse to the words collection. Words are stemmed.
 *
 * @param toAnalyse the string to be analysed
 * @param words the collection to which extracted words are added
 */
protected void analyseString(String toAnalyse, Collection<String> words) {
    TokenStream tokenS = analyzer.tokenStream("", new StringReader(toAnalyse));
    TermAttribute termAtt = tokenS.addAttribute(TermAttribute.class);
    try {
        while (tokenS.incrementToken()) {
            words.add(termAtt.term());
        }
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}