List of usage examples for org.apache.lucene.analysis TokenStream addAttribute
public final <T extends Attribute> T addAttribute(Class<T> attClass)
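Before the collected examples, a minimal, self-contained sketch of the usual addAttribute pattern, written against a recent Lucene API; the class name, the field name "body", and the sample text are placeholders, not taken from any of the sources below. Attributes are requested before the stream is consumed, and the same attribute instances are updated on every call to incrementToken().

import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class AddAttributeSketch {
    public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer();
        // addAttribute() is called before consuming the stream; it returns the
        // existing instance if the attribute has already been added.
        try (TokenStream stream = analyzer.tokenStream("body", "The quick brown fox")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
            stream.reset();                  // mandatory before incrementToken()
            while (stream.incrementToken()) {
                System.out.println(term + " [" + offset.startOffset() + "," + offset.endOffset() + "]");
            }
            stream.end();                    // record final offset state
        }                                    // try-with-resources closes the stream
        analyzer.close();
    }
}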
From source file: lia.chapter4.AnalyzerUtils.java
License: Apache License
public static void displayPositionIncrements(Analyzer analyzer, String text) throws IOException {
    TokenStream stream = analyzer.tokenStream("text", text);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    stream.reset(); // required before incrementToken() in current Lucene versions
    while (stream.incrementToken()) {
        System.out.println("posIncr=" + posIncr.getPositionIncrement());
    }
}
From source file: lia.chapter4.SimpleAnalyzer.java
License: Apache License
public static void main(String[] args) throws IOException {
    TokenStream tokenStream = new SimpleAnalyzer().tokenStream("text", "The quick brown fox..");
    OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = tokenStream.addAttribute(CharTermAttribute.class);

    tokenStream.reset();
    while (tokenStream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        System.out.println(startOffset);
        int endOffset = offsetAttribute.endOffset();
        System.out.println(endOffset);
        String term = charTermAttribute.toString();
        System.out.println(term);
    }

    /*AnalyzerUtils.displayTokensWithFullDetails(new SimpleAnalyzer(), "The quick brown fox....");*/
}
From source file: lucandra.IndexWriter.java
License: Apache License
@SuppressWarnings("unchecked")
public void addDocument(Document doc, Analyzer analyzer) throws CorruptIndexException, IOException {

    List<String> allIndexedTerms = new ArrayList<String>();

    // check for special field name
    String docId = doc.get(CassandraUtils.documentIdField);

    if (docId == null)
        docId = Long.toHexString((long) (System.nanoTime() + (Math.random() * System.nanoTime())));

    int position = 0;

    for (Fieldable field : (List<Fieldable>) doc.getFields()) {

        // Untokenized fields go in without a termPosition
        if (field.isIndexed() && !field.isTokenized()) {

            String term = CassandraUtils.createColumnName(field.name(), field.stringValue());
            allIndexedTerms.add(term);

            String key = indexName + CassandraUtils.delimeter + term;

            Map<String, List<Number>> termMap = new HashMap<String, List<Number>>();
            termMap.put(CassandraUtils.termFrequencyKey, CassandraUtils.emptyArray);
            termMap.put(CassandraUtils.positionVectorKey, CassandraUtils.emptyArray);

            CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.termVecColumnFamily,
                    docId.getBytes("UTF-8"), CassandraUtils.hashKey(key), null, termMap);

        } else if (field.isIndexed()) {

            TokenStream tokens = field.tokenStreamValue();

            if (tokens == null) {
                tokens = analyzer.tokenStream(field.name(), new StringReader(field.stringValue()));
            }

            // collect term information per field
            Map<String, Map<String, List<Number>>> allTermInformation = new HashMap<String, Map<String, List<Number>>>();

            int lastOffset = 0;
            if (position > 0) {
                position += analyzer.getPositionIncrementGap(field.name());
            }

            // Build the termPositions vector for all terms
            tokens.reset(); // reset the TokenStream to the first token

            // set up token attributes we are working on

            // offsets
            OffsetAttribute offsetAttribute = null;
            if (field.isStoreOffsetWithTermVector())
                offsetAttribute = (OffsetAttribute) tokens.addAttribute(OffsetAttribute.class);

            // positions
            PositionIncrementAttribute posIncrAttribute = null;
            if (field.isStorePositionWithTermVector())
                posIncrAttribute = (PositionIncrementAttribute) tokens
                        .addAttribute(PositionIncrementAttribute.class);

            TermAttribute termAttribute = (TermAttribute) tokens.addAttribute(TermAttribute.class);

            // store normalizations of field per term per document rather
            // than per field.
            // this adds more to write but less to read on other side
            Integer tokensInField = new Integer(0);

            while (tokens.incrementToken()) {
                tokensInField++;
                String term = CassandraUtils.createColumnName(field.name(), termAttribute.term());

                allIndexedTerms.add(term);

                // fetch all collected information for this term
                Map<String, List<Number>> termInfo = allTermInformation.get(term);

                if (termInfo == null) {
                    termInfo = new HashMap<String, List<Number>>();
                    allTermInformation.put(term, termInfo);
                }

                // term frequency
                {
                    List<Number> termFrequency = termInfo.get(CassandraUtils.termFrequencyKey);

                    if (termFrequency == null) {
                        termFrequency = new ArrayList<Number>();
                        termFrequency.add(new Integer(0));
                        termInfo.put(CassandraUtils.termFrequencyKey, termFrequency);
                    }

                    // increment
                    termFrequency.set(0, termFrequency.get(0).intValue() + 1);
                }

                // position vector
                if (field.isStorePositionWithTermVector()) {
                    position += (posIncrAttribute.getPositionIncrement() - 1);

                    List<Number> positionVector = termInfo.get(CassandraUtils.positionVectorKey);

                    if (positionVector == null) {
                        positionVector = new ArrayList<Number>();
                        termInfo.put(CassandraUtils.positionVectorKey, positionVector);
                    }

                    positionVector.add(++position);
                }

                // term offsets
                if (field.isStoreOffsetWithTermVector()) {

                    List<Number> offsetVector = termInfo.get(CassandraUtils.offsetVectorKey);
                    if (offsetVector == null) {
                        offsetVector = new ArrayList<Number>();
                        termInfo.put(CassandraUtils.offsetVectorKey, offsetVector);
                    }

                    offsetVector.add(lastOffset + offsetAttribute.startOffset());
                    offsetVector.add(lastOffset + offsetAttribute.endOffset());
                }
            }

            List<Number> bnorm = null;
            if (!field.getOmitNorms()) {
                bnorm = new ArrayList<Number>();
                float norm = doc.getBoost();
                norm *= field.getBoost();
                norm *= similarity.lengthNorm(field.name(), tokensInField);
                bnorm.add(Similarity.encodeNorm(norm));
            }

            for (Map.Entry<String, Map<String, List<Number>>> term : allTermInformation.entrySet()) {

                // Terms are stored within a unique key combination
                // This is required since cassandra loads all columns
                // in a key/column family into memory
                String key = indexName + CassandraUtils.delimeter + term.getKey();

                // Mix in the norm for this field alongside each term
                // more writes but faster on read side.
                if (!field.getOmitNorms()) {
                    term.getValue().put(CassandraUtils.normsKey, bnorm);
                }

                CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.termVecColumnFamily,
                        docId.getBytes("UTF-8"), CassandraUtils.hashKey(key), null, term.getValue());
            }
        }

        // Stores each field as a column under this doc key
        if (field.isStored()) {

            byte[] _value = field.isBinary() ? field.getBinaryValue() : field.stringValue().getBytes("UTF-8");

            // first byte flags if binary or not
            byte[] value = new byte[_value.length + 1];
            System.arraycopy(_value, 0, value, 0, _value.length);

            value[value.length - 1] = (byte) (field.isBinary() ? Byte.MAX_VALUE : Byte.MIN_VALUE);

            String key = indexName + CassandraUtils.delimeter + docId;

            CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.docColumnFamily,
                    field.name().getBytes("UTF-8"), CassandraUtils.hashKey(key), value, null);
        }
    }

    // Finally, store meta-data so we can delete this document
    String key = indexName + CassandraUtils.delimeter + docId;
    CassandraUtils.addToMutationMap(getMutationMap(), CassandraUtils.docColumnFamily,
            CassandraUtils.documentMetaField.getBytes("UTF-8"), CassandraUtils.hashKey(key),
            CassandraUtils.toBytes(allIndexedTerms), null);

    if (autoCommit)
        CassandraUtils.robustBatchInsert(client, getMutationMap());
}
From source file: lucli.LuceneMethods.java
License: Apache License
private void invertDocument(Document doc) throws IOException {

    Map tokenMap = new HashMap();
    final int maxFieldLength = 10000;

    Analyzer analyzer = createAnalyzer();
    Iterator fields = doc.getFields().iterator();
    final Token reusableToken = new Token();
    while (fields.hasNext()) {
        Field field = (Field) fields.next();
        String fieldName = field.name();

        if (field.isIndexed()) {
            if (field.isTokenized()) { // tokenized field
                Reader reader; // find or make Reader
                if (field.readerValue() != null)
                    reader = field.readerValue();
                else if (field.stringValue() != null)
                    reader = new StringReader(field.stringValue());
                else
                    throw new IllegalArgumentException("field must have either String or Reader value");

                int position = 0;
                // Tokenize field and add to postingTable
                TokenStream stream = analyzer.tokenStream(fieldName, reader);
                TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);
                PositionIncrementAttribute posIncrAtt = (PositionIncrementAttribute) stream
                        .addAttribute(PositionIncrementAttribute.class);

                try {
                    while (stream.incrementToken()) {
                        position += (posIncrAtt.getPositionIncrement() - 1);
                        position++;
                        String name = termAtt.term();
                        Integer Count = (Integer) tokenMap.get(name);
                        if (Count == null) { // not in there yet
                            tokenMap.put(name, new Integer(1)); // first one
                        } else {
                            int count = Count.intValue();
                            tokenMap.put(name, new Integer(count + 1));
                        }
                        if (position > maxFieldLength)
                            break;
                    }
                } finally {
                    stream.close();
                }
            }
        }
    }
    Entry[] sortedHash = getSortedMapEntries(tokenMap);
    for (int ii = 0; ii < sortedHash.length && ii < 10; ii++) {
        Entry currentEntry = sortedHash[ii];
        message((ii + 1) + ":" + currentEntry.getKey() + " " + currentEntry.getValue());
    }
}
From source file: mahout.classifier.Classifier.java
License: Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 5) {
        System.out.println("Arguments: [model] [label index] [dictionary] [document frequency] [tweet file]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tweetsPath = args[4];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract words from tweets
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    BufferedReader reader = new BufferedReader(new FileReader(tweetsPath));
    while (true) {
        String line = reader.readLine();
        if (line == null) {
            break;
        }

        String[] tokens = line.split("\t", 2);
        String tweetId = tokens[0];
        String tweet = tokens[1];

        Multiset<String> words = ConcurrentHashMultiset.create();

        // extract words from the tweet
        TokenStream ts = analyzer.tokenStream("text", new StringReader(tweet));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        int wordCount = 0;
        while (ts.incrementToken()) {
            if (termAtt.length() > 0) {
                String word = ts.getAttribute(CharTermAttribute.class).toString();
                Integer wordId = dictionary.get(word);
                // if the word is not in the dictionary, skip it
                if (wordId != null) {
                    words.add(word);
                    wordCount++;
                }
            }
        }
        ts.end();
        ts.close(); // release the reused stream before the next analyzer.tokenStream() call

        // create vector wordId => weight using tf-idf
        Vector vector = new RandomAccessSparseVector(10000);
        TFIDF tfidf = new TFIDF();
        for (Multiset.Entry<String> entry : words.entrySet()) {
            String word = entry.getElement();
            int count = entry.getCount();
            Integer wordId = dictionary.get(word);
            Long freq = documentFrequency.get(wordId);
            double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
            vector.setQuick(wordId, tfIdfValue);
        }

        // The classifier returns one score per label; the label with the
        // highest score is the one the tweet is most likely associated with.
        Vector resultVector = classifier.classifyFull(vector);
        double bestScore = -Double.MAX_VALUE;
        int bestCategoryId = -1;
        for (Element element : resultVector.all()) {
            int categoryId = element.index();
            double score = element.get();
            if (score > bestScore) {
                bestScore = score;
                bestCategoryId = categoryId;
            }
        }
        System.out.println(labels.get(bestCategoryId) + "\t" + tweet);
    }
    analyzer.close();
    reader.close();
}
From source file: me.smoe.adar.analyzer.luence.AnalyzerToy.java
License: Apache License
public static void analyzerByStop(String sentence) throws Exception {
    Analyzer analyzer = new StopAnalyzer();

    TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence));
    tokenStream.addAttribute(CharTermAttribute.class);
    tokenStream.reset();

    while (tokenStream.incrementToken()) {
        CharTermAttribute charTermAttribute = (CharTermAttribute) tokenStream
                .getAttribute(CharTermAttribute.class);
        System.out.print(charTermAttribute.toString() + " ,");
    }

    analyzer.close();
}
From source file: me.smoe.adar.analyzer.luence.AnalyzerToy.java
License: Apache License
public static Set<String> analyzerByStandard(String sentence) throws Exception {
    Analyzer analyzer = new StandardAnalyzer();
    try {
        TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();

        Set<String> words = new HashSet<>();
        while (tokenStream.incrementToken()) {
            words.add(((CharTermAttribute) tokenStream.getAttribute(CharTermAttribute.class)).toString());
        }
        return words;
    } finally {
        analyzer.close();
    }
}
From source file: me.smoe.adar.utils.cam.o.common.SentenceAnalyzer.java
License: Apache License
public static Set<String> analyzer(String sentence) throws Exception {
    if (StringUtils.isEmpty(sentence)) {
        return Collections.emptySet();
    }

    Analyzer analyzer = new StandardAnalyzer();
    try {
        TokenStream tokenStream = analyzer.tokenStream(StringUtils.EMPTY, new StringReader(sentence));
        tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();

        Set<String> words = new LinkedHashSet<>();
        while (tokenStream.incrementToken()) {
            String word = ((CharTermAttribute) tokenStream.getAttribute(CharTermAttribute.class)).toString();
            if (word.length() <= 1) {
                continue;
            }
            words.add(word);
        }
        return words;
    } finally {
        analyzer.close();
    }
}
From source file: modnlp.idx.inverted.TokeniserJPLucene.java
License: Open Source License
public void tokenise() throws IOException {

    String ignregexp = "--+|\\.\\.+|\\.+\\p{Space}"; // delete full stops and dashes (typically not used).
    if (ignoredElements != null && ignoredElements.length() > 0)
        ignregexp = ignregexp + "|< *" + ignoredElements + "[^>]*?/>" + "|< *" + ignoredElements + ".*?>.*?</"
                + ignoredElements + " *>";
    if (!tagIndexing)
        ignregexp = ignregexp + "|<.*?>";
    //ignregexp = ignregexp+"|\\W\\W+";

    Pattern p = Pattern.compile(ignregexp);
    Matcher igns = p.matcher(originalText);

    StringBuffer tx = new StringBuffer(originalText);

    int ct = 1;
    while (igns.find()) {
        int s = igns.start();
        int e = igns.end();
        if (verbose)
            PrintUtil.printNoMove("Processing exclusions ...", ct++);
        //System.err.println("replacing\n-----------"+originalText.substring(s,e)+"\n--------------");
        char sp[] = new char[e - s];
        for (int j = 0; j < sp.length; j++) {
            sp[j] = ' ';
        }
        tx.replace(s, e, new String(sp));
    }
    if (verbose)
        PrintUtil.donePrinting();
    ct = 1;
    //verbose = false;

    String text = new String(tx);
    //System.out.println("-->"+text+"<--");

    Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(text), null, true,
            org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH);
    TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
    //stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags);
    stream = new CJKWidthFilter(stream);
    //stream = new StopFilter(matchVersion, stream, stopwords);
    stream = new JapaneseKatakanaStemFilter(stream);
    //stream = new LowerCaseFilter(matchVersion, stream);

    OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);
    CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);

    while (stream.incrementToken()) {
        int startOffset = offsetAttribute.startOffset();
        int endOffset = offsetAttribute.endOffset();
        String token = charTermAttribute.toString();
        tokenMap.putPos(token, startOffset);
        //System.out.println(token.str+" \t\tS="+token.start+" E="+token.end);
    }
    if (verbose)
        PrintUtil.donePrinting();
    ct = 1;
}
From source file: modnlp.idx.inverted.TokeniserJPLucene.java
License: Open Source License
public List<String> split(String s) {
    ArrayList<String> ret = new ArrayList<String>();

    try {
        Tokenizer tokenizer = new JapaneseTokenizer(new StringReader(s), null, true,
                org.apache.lucene.analysis.ja.JapaneseTokenizer.Mode.SEARCH);
        TokenStream stream = new JapaneseBaseFormFilter(tokenizer);
        //stream = new JapanesePartOfSpeechStopFilter(true, stream, stoptags);
        stream = new CJKWidthFilter(stream);
        //stream = new StopFilter(matchVersion, stream, stopwords);
        stream = new JapaneseKatakanaStemFilter(stream);
        //stream = new LowerCaseFilter(matchVersion, stream);

        OffsetAttribute offsetAttribute = stream.addAttribute(OffsetAttribute.class);
        CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);

        while (stream.incrementToken()) {
            int startOffset = offsetAttribute.startOffset();
            int endOffset = offsetAttribute.endOffset();
            String token = charTermAttribute.toString();
            ret.add(token);
            //System.out.println(token.str+" \t\tS="+token.start+" E="+token.end);
        }
    } catch (java.io.IOException e) {
        System.err.println(e);
    }
    return ret;
}