List of usage examples for org.apache.lucene.analysis TokenStream addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
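Before the per-project examples, note the canonical consumption pattern they all share: addAttribute registers an attribute with the stream (or returns the instance that is already registered), and that single instance is updated in place for each token between reset() and end(). The following snippet is a minimal, self-contained sketch rather than code from any project listed here; it assumes Lucene 4.6 on the classpath, and the class name AddAttributeExample and the sample input string are made up for illustration.

import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;

public class AddAttributeExample {
    public static void main(String[] args) throws Exception {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);
        TokenStream ts = analyzer.tokenStream("text",
                new StringReader("TokenStream attributes are reused for every token"));
        // addAttribute returns the single attribute instance registered with this stream,
        // creating it on the first call and returning the same instance on later calls
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        try {
            ts.reset();                       // required before the first incrementToken()
            while (ts.incrementToken()) {
                System.out.println(termAtt.toString()
                        + " [" + offsetAtt.startOffset() + "," + offsetAtt.endOffset() + "]");
            }
            ts.end();                         // records the final offset state
        } finally {
            ts.close();
            analyzer.close();
        }
    }
}

Because the attribute instances are shared and reused, copy the term text (for example with termAtt.toString()) if it must outlive the current call to incrementToken().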
From source file:edu.mit.ll.vizlinc.highlight.Highlighter.java
License:Apache License
/**
 * Low level api to get the most relevant (formatted) sections of the document.
 * This method has been made public to allow visibility of score information held in TextFragment objects.
 * Thanks to Jason Calabrese for help in redefining the interface.
 * @param tokenStream
 * @param text
 * @param maxNumFragments
 * @param mergeContiguousFragments
 * @throws IOException
 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 */
public final TextFragment[] getBestTextFragments(TokenStream tokenStream, String text,
        boolean mergeContiguousFragments, int maxNumFragments)
        throws IOException, InvalidTokenOffsetsException {
    ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
    StringBuilder newText = new StringBuilder();

    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    tokenStream.addAttribute(PositionIncrementAttribute.class);
    tokenStream.reset();

    TextFragment currentFrag = new TextFragment(newText, newText.length(), docFrags.size());

    if (fragmentScorer instanceof QueryScorer) {
        ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
    }

    TokenStream newStream = fragmentScorer.init(tokenStream);
    if (newStream != null) {
        tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);

    FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);

    try {
        String tokenText;
        int startOffset;
        int endOffset;
        int lastEndOffset = 0;
        textFragmenter.start(text, tokenStream);

        TokenGroup tokenGroup = new TokenGroup(tokenStream);

        for (boolean next = tokenStream.incrementToken(); next
                && (offsetAtt.startOffset() < maxDocCharsToAnalyze); next = tokenStream.incrementToken()) {
            if ((offsetAtt.endOffset() > text.length()) || (offsetAtt.startOffset() > text.length())) {
                throw new InvalidTokenOffsetsException("Token " + termAtt.toString()
                        + " exceeds length of provided text sized " + text.length());
            }
            if ((tokenGroup.numTokens > 0) && (tokenGroup.isDistinct())) {
                // the current token is distinct from previous tokens -
                // markup the cached token group info
                startOffset = tokenGroup.matchStartOffset;
                endOffset = tokenGroup.matchEndOffset;
                tokenText = text.substring(startOffset, endOffset);
                String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
                // store any whitespace etc from between this and last group
                if (startOffset > lastEndOffset)
                    newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
                newText.append(markedUpText);
                lastEndOffset = Math.max(endOffset, lastEndOffset);
                tokenGroup.clear();

                // check if current token marks the start of a new fragment
                if (textFragmenter.isNewFragment()) {
                    currentFrag.setScore(fragmentScorer.getFragmentScore());
                    // record stats for a new fragment
                    currentFrag.textEndPos = newText.length();
                    currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
                    fragmentScorer.startFragment(currentFrag);
                    docFrags.add(currentFrag);
                }
            }

            tokenGroup.addToken(fragmentScorer.getTokenScore());

            // if (lastEndOffset > maxDocBytesToAnalyze)
            // {
            //     break;
            // }
        }
        currentFrag.setScore(fragmentScorer.getFragmentScore());

        if (tokenGroup.numTokens > 0) {
            // flush the accumulated text (same code as in above loop)
            startOffset = tokenGroup.matchStartOffset;
            endOffset = tokenGroup.matchEndOffset;
            tokenText = text.substring(startOffset, endOffset);
            String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
            // store any whitespace etc from between this and last group
            if (startOffset > lastEndOffset)
                newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
            newText.append(markedUpText);
            lastEndOffset = Math.max(lastEndOffset, endOffset);
        }

        // Test what remains of the original text beyond the point where we stopped analyzing
        if (
            // if there is text beyond the last token considered..
            (lastEndOffset < text.length()) &&
            // and that text is not too large...
            (text.length() <= maxDocCharsToAnalyze)) {
            // append it to the last fragment
            newText.append(encoder.encodeText(text.substring(lastEndOffset)));
        }

        currentFrag.textEndPos = newText.length();

        // sort the most relevant sections of the text
        for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext();) {
            currentFrag = i.next();

            // If you are running with a version of Lucene before 11th Sept 03
            // you do not have PriorityQueue.insert() - so uncomment the code below
            /*
             * if (currentFrag.getScore() >= minScore)
             * {
             *     fragQueue.put(currentFrag);
             *     if (fragQueue.size() > maxNumFragments)
             *     { // if hit queue overfull
             *         fragQueue.pop(); // remove lowest in hit queue
             *         minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
             *     }
             * }
             */

            // The above code caused a problem as a result of Christoph Goller's 11th Sept 03
            // fix to PriorityQueue. The correct method to use here is the new "insert" method
            // USE ABOVE CODE IF THIS DOES NOT COMPILE!
            fragQueue.insertWithOverflow(currentFrag);
        }

        // return the most relevant fragments
        TextFragment frag[] = new TextFragment[fragQueue.size()];
        for (int i = frag.length - 1; i >= 0; i--) {
            frag[i] = fragQueue.pop();
        }

        // merge any contiguous fragments to improve readability
        if (mergeContiguousFragments) {
            mergeContiguousFragments(frag);
            ArrayList<TextFragment> fragTexts = new ArrayList<TextFragment>();
            for (int i = 0; i < frag.length; i++) {
                if ((frag[i] != null) && (frag[i].getScore() > 0)) {
                    fragTexts.add(frag[i]);
                }
            }
            frag = fragTexts.toArray(new TextFragment[0]);
        }
        return frag;

    } finally {
        if (tokenStream != null) {
            try {
                tokenStream.end();
                tokenStream.close();
            } catch (Exception e) {
            }
        }
    }
}
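For orientation, a caller-side sketch (not part of the original source): it assumes this package mirrors the stock Lucene contrib Highlighter API, that query, analyzer, and text are already defined by the caller, and that the checked IOException and InvalidTokenOffsetsException are handled or declared by the surrounding method.

QueryScorer scorer = new QueryScorer(query, "text");
Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), scorer);
highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 100));
TokenStream tokenStream = analyzer.tokenStream("text", new StringReader(text));
// mergeContiguousFragments = false, at most 5 fragments
TextFragment[] frags = highlighter.getBestTextFragments(tokenStream, text, false, 5);
for (TextFragment frag : frags) {
    if (frag != null && frag.getScore() > 0) {
        System.out.println(frag.toString());
    }
}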
From source file:edu.mit.ll.vizlinc.highlight.QueryScorer.java
License:Apache License
public TokenStream init(TokenStream tokenStream) throws IOException {
    position = -1;
    termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
    if (!skipInitExtractor) {
        if (fieldWeightedSpanTerms != null) {
            fieldWeightedSpanTerms.clear();
        }
        return initExtractor(tokenStream);
    }
    return null;
}
From source file:edu.mit.ll.vizlinc.highlight.QueryTermScorer.java
License:Apache License
public TokenStream init(TokenStream tokenStream) {
    termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    return null;
}
From source file:edu.mit.ll.vizlinc.highlight.SimpleFragmenter.java
License:Apache License
public void start(String originalText, TokenStream stream) {
    offsetAtt = stream.addAttribute(OffsetAttribute.class);
    currentNumFrags = 1;
}
From source file:edu.mit.ll.vizlinc.highlight.SimpleSpanFragmenter.java
License:Apache License
public void start(String originalText, TokenStream tokenStream) {
    position = -1;
    currentNumFrags = 1;
    textSize = originalText.length();
    termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
    offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
}
From source file:edu.stanford.rad.naivebayes.ClassifyLines.java
License:Apache License
public static void main(String[] args) throws Exception {
    // if (args.length < 5) {
    //     System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
    //     return;
    // }
    // String modelPath = args[0];
    // String labelIndexPath = args[1];
    // String dictionaryPath = args[2];
    // String documentFrequencyPath = args[3];
    // String tweetsPath = args[4];
    String modelPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb";
    String labelIndexPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/classification/nb/labelindex";
    String dictionaryPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/dictionary.file-0";
    String documentFrequencyPath = "/Users/saeedhp/Dropbox/Stanford/Code/NER/files/stride/ectopicPregnancy/vectors/TFIDFsparseSeqdir/df-count/part-r-00000";
    String tweetsPath = "/Users/saeedhp/Desktop/tweet/tweet.txt";

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_46);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);
    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        System.out.println("Tweet: " + tweetId + "\t" + tweet);

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        // Fixed error : close ts:TokenStream
        ts.end();
        ts.close();

        // create vector wordId => weight using tfidf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }

        // With the classifier, we get one score for each label
        // The label with the highest score is the one the tweet is more likely to
        // be associated to
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
            System.out.print(" " + labels.get(categoryId) + ": " + score);
        }
        System.out.println(" => " + labels.get(bestCategoryId));
    }
    analyzer.close();
    reader.close();
}
From source file:filters.indexing.IndexableFilter.java
License:Open Source License
/**
 * Constructor for class IndexableFilter
 * @param input
 */
public IndexableFilter(TokenStream input, boolean set_synset_terms) {
    super(input);
    // Getting attributes from input token stream
    input_term = input.getAttribute(TermAttribute.class);
    input_type = input.getAttribute(TypeAttribute.class);
    input_flags = input.getAttribute(FlagsAttribute.class);
    input_payload = input.getAttribute(PayloadAttribute.class);
    // Setting attributes for this token stream
    output_term = this.addAttribute(TermAttribute.class);
    output_type = this.addAttribute(TypeAttribute.class);
    output_flags = this.addAttribute(FlagsAttribute.class);
    output_payload = input.addAttribute(PayloadAttribute.class);
    this.set_synset_terms = set_synset_terms;
}
From source file:fr.ericlab.sondy.core.DataManipulation.java
License:Open Source License
public void prepareStream(String datasetName, int intervalDuration, int ngram, String stemLanguage,
        boolean lemmatization, AppVariables appVariables) {
    try {
        Connection connection;
        Class.forName("com.mysql.jdbc.Driver").newInstance();
        connection = DriverManager.getConnection("jdbc:mysql://" + appVariables.configuration.getHost(),
                appVariables.configuration.getUsername(), appVariables.configuration.getPassword());
        Statement statement = connection.createStatement();
        Statement statement2 = connection.createStatement();
        String lemStr = (lemmatization) ? "_lem1" : "_lem0";
        statement.executeUpdate("CREATE TABLE " + appVariables.configuration.getSchema() + "." + datasetName
                + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                + "gram ( id INT NOT NULL AUTO_INCREMENT PRIMARY KEY, msg_author VARCHAR(100), msg_post_time TIMESTAMP, msg_text VARCHAR(600), time_slice INT)ENGINE=myisam;");
        // statement.executeUpdate("CREATE INDEX index_time ON "+appVariables.configuration.getSchema()+"."+datasetName+"_messages (msg_post_time)");
        ResultSet rsTMin = statement.executeQuery("select min(msg_post_time) from "
                + appVariables.configuration.getSchema() + "." + datasetName + "_messages;");
        rsTMin.next();
        Timestamp tMin = rsTMin.getTimestamp(1);
        ResultSet rsTMax = statement.executeQuery("select max(msg_post_time) from "
                + appVariables.configuration.getSchema() + "." + datasetName + "_messages;");
        rsTMax.next();
        Timestamp tMax = rsTMax.getTimestamp(1);
        Timestamp tRef = new Timestamp(0);
        long base = (tMin.getTime() - tRef.getTime()) * 1L;
        long streamDuration = (tMax.getTime() - tMin.getTime()) * 1L;
        long streamDurationMin = (streamDuration / 1000) / 60;

        String path = appVariables.configuration.getWorkspace() + "/datasets/" + datasetName + "/"
                + intervalDuration + "min-" + stemLanguage;
        path += (lemmatization) ? "-lem1" : "-lem0";
        path += "-" + ngram + "gram";
        String pathMention = path + "-m";
        FSDirectory indexGlobal = FSDirectory.open(new File(path));
        FSDirectory indexMention = FSDirectory.open(new File(pathMention));

        Analyzer analyzer;
        Properties props = new Properties();
        props.put("annotators", "tokenize,ssplit,parse,lemma");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
        Annotation annotation;
        if (stemLanguage.equalsIgnoreCase("Standard")) {
            analyzer = new StandardAnalyzer(Version.LUCENE_36);
        } else {
            Class cl;
            if (stemLanguage.equals("Chinese")) {
                analyzer = new SmartChineseAnalyzer(Version.LUCENE_36);
            } else {
                String packageName = stemLanguage.substring(0, 2).toLowerCase();
                cl = Class.forName("org.apache.lucene.analysis." + packageName + "." + stemLanguage + "Analyzer");
                Class[] types = new Class[] { Version.class, Set.class };
                Constructor ct = cl.getConstructor(types);
                analyzer = (Analyzer) ct.newInstance(Version.LUCENE_36, appVariables.currentStopWords.getSet());
            }
        }

        IndexWriterConfig configGlobal;
        IndexWriterConfig configMention;
        ShingleAnalyzerWrapper shingleAnalyzer = null;
        if (ngram > 1) {
            shingleAnalyzer = new ShingleAnalyzerWrapper(analyzer, ngram, ngram, " ", false, false);
            WhitespaceAnalyzer whitespaceAnalyzer = new WhitespaceAnalyzer(Version.LUCENE_36);
            configGlobal = new IndexWriterConfig(Version.LUCENE_36, whitespaceAnalyzer);
            configMention = new IndexWriterConfig(Version.LUCENE_36, whitespaceAnalyzer);
        } else {
            configGlobal = new IndexWriterConfig(Version.LUCENE_36, analyzer);
            configMention = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        }
        IndexWriter wGlobal = new IndexWriter(indexGlobal, configGlobal);
        IndexWriter wMention = new IndexWriter(indexMention, configMention);

        int docId = 0;
        for (int i = 0; i < streamDurationMin; i += intervalDuration) {
            statement = connection.createStatement();
            long infBound = base + i * 60 * 1000L;
            long supBound = base + (i + intervalDuration) * 60 * 1000L;
            Timestamp infTime = new Timestamp(infBound);
            Timestamp supTime = new Timestamp(supBound);
            ResultSet rs = statement.executeQuery("SELECT msg_text, msg_post_time, msg_author FROM "
                    + appVariables.configuration.getSchema() + "." + datasetName
                    + "_messages WHERE msg_post_time>'" + infTime + "' AND msg_post_time< '" + supTime + "'");
            String globalContent = new String();
            String mentionContent = new String();
            String timestamps = new String();
            NumberFormat formatter = new DecimalFormat("00000000");
            int bulk = 0;
            String bulkString = "";
            boolean mention;
            while (rs.next()) {
                String message = rs.getString(1).toLowerCase();
                mention = message.contains("@");
                if (lemmatization) {
                    annotation = new Annotation(message);
                    message = "";
                    pipeline.annotate(annotation);
                    List<CoreMap> lem = annotation.get(SentencesAnnotation.class);
                    for (CoreMap l : lem) {
                        for (CoreLabel token : l.get(TokensAnnotation.class)) {
                            message += token.get(LemmaAnnotation.class) + " ";
                        }
                    }
                }
                if (ngram > 1) {
                    String processedMessage = "";
                    TokenStream tokenStream = shingleAnalyzer.tokenStream("text", new StringReader(message));
                    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);
                    while (tokenStream.incrementToken()) {
                        String termToken = charTermAttribute.toString();
                        if (!termToken.contains("_")) {
                            processedMessage += termToken.replace(" ", "=") + " ";
                        }
                    }
                    message = processedMessage;
                }
                bulk++;
                if (bulk < _BULK_SIZE_) {
                    bulkString += " (" + docId + ",'" + rs.getString(2) + "',\"" + message + "\",\""
                            + rs.getString(3) + "\"),";
                } else {
                    bulk = 0;
                    bulkString += " (" + docId + ",'" + rs.getString(2) + "',\"" + message + "\",\""
                            + rs.getString(3) + "\");";
                    statement2.executeUpdate("INSERT INTO " + appVariables.configuration.getSchema() + "."
                            + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                            + "gram (time_slice,msg_post_time,msg_text,msg_author) VALUES" + bulkString);
                    bulkString = "";
                }
                globalContent += message + "\n";
                if (mention) {
                    mentionContent += message + "\n";
                }
                timestamps += rs.getString(2) + "\n";
            }
            if (bulk > 0 && bulkString.length() > 0) {
                statement2.executeUpdate("INSERT INTO " + appVariables.configuration.getSchema() + "."
                        + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                        + "gram (time_slice,msg_post_time,msg_text,msg_author) VALUES"
                        + bulkString.substring(0, bulkString.length() - 1) + ";");
            }
            Document docGlobal = new Document();
            docGlobal.add(new Field("content", globalContent, Field.Store.YES, Field.Index.ANALYZED,
                    Field.TermVector.YES));
            docGlobal.add(new Field("id", Integer.toString(docId), Field.Store.YES, Field.Index.NOT_ANALYZED));
            wGlobal.addDocument(docGlobal);
            wGlobal.commit();
            Document docMention = new Document();
            docMention.add(new Field("content", mentionContent, Field.Store.YES, Field.Index.ANALYZED,
                    Field.TermVector.YES));
            docMention.add(new Field("id", Integer.toString(docId), Field.Store.YES, Field.Index.NOT_ANALYZED));
            wMention.addDocument(docMention);
            wMention.commit();

            File textFile = new File(path + "/input/" + formatter.format(docId) + ".text");
            FileUtils.writeStringToFile(textFile, globalContent);
            File timeFile = new File(path + "/input/" + formatter.format(docId) + ".time");
            FileUtils.writeStringToFile(timeFile, timestamps);
            docId++;
            statement.close();
        }
        statement2.executeUpdate("CREATE INDEX index_time_slice ON " + appVariables.configuration.getSchema()
                + "." + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                + "gram (time_slice);");
        statement2.executeUpdate("CREATE FULLTEXT INDEX index_text ON " + appVariables.configuration.getSchema()
                + "." + datasetName + "_" + intervalDuration + "min_" + stemLanguage + lemStr + "_" + ngram
                + "gram (msg_text);");
        statement2.close();
        connection.close();
        wGlobal.close();
        wMention.close();
    } catch (IOException ex) {
        Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex);
    } catch (SQLException | InstantiationException | IllegalAccessException | ClassNotFoundException ex) {
        Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex);
    } catch (NoSuchMethodException | SecurityException | IllegalArgumentException | InvocationTargetException ex) {
        Logger.getLogger(DataManipulation.class.getName()).log(Level.SEVERE, null, ex);
    }
}
From source file:fr.inrialpes.exmo.ontosim.string.CommonWords.java
License:Open Source License
private void extractTerms(String e) {
    Set<String> s = new LinkedHashSet<String>();
    TokenStream ts = analyzer.tokenStream("", new StringReader(e));
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    try {
        while (ts.incrementToken()) {
            s.add(termAtt.term());
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
    /*
     * Token token;
     * try {
     *     while ((token = ts.next()) != null) {
     *         s.add(token.termText());
     *     }
     * } catch (IOException ex) {
     *     ex.printStackTrace();
     * }
     */
    map.put(e, s);
}
From source file:fr.inrialpes.exmo.ontosim.string.JWNLDistances.java
License:Open Source License
/**
 * Takes a gloss-like string (text) and returns it tokenized,
 * with:
 * - stopwords removed
 * - lower case
 * - porter stemmer
 */
protected Set<String> tokenizeGloss(String s) throws IOException {
    Set<String> result = new HashSet<String>();
    // I am afraid that I am reimplementing the StandardAnalyzer...
    TokenStream ts = new PorterStemFilter(
            new StopFilter(true, new LowerCaseTokenizer(new StringReader(s)), stopWords, true));
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    while (ts.incrementToken()) {
        result.add(termAtt.term());
    }
    return result;
}