List of usage examples for org.apache.lucene.analysis.shingle.ShingleFilter#incrementToken()
@Override
public boolean incrementToken() throws IOException
From source file:com.elex.dmp.vectorizer.TFPartialVectorReducer.java
License:Apache License
@Override protected void reduce(Text key, Iterable<StringTuple> values, Context context) throws IOException, InterruptedException { Iterator<StringTuple> it = values.iterator(); if (!it.hasNext()) { return;/*from ww w .j a va 2s . com*/ } StringTuple value = it.next(); Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size if (maxNGramSize >= 2) { ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxNGramSize); try { do { String term = sf.getAttribute(CharTermAttribute.class).toString(); if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram int termId = dictionary.get(term); vector.setQuick(termId, vector.getQuick(termId) + 1); } } while (sf.incrementToken()); sf.end(); } finally { Closeables.closeQuietly(sf); } } else { for (String term : value.getEntries()) { if (!term.isEmpty() && dictionary.containsKey(term)) { // unigram int termId = dictionary.get(term); vector.setQuick(termId, vector.getQuick(termId) + 1); } } } if (sequentialAccess) { vector = new SequentialAccessSparseVector(vector); } if (namedVector) { vector = new NamedVector(vector, key.toString()); } // if the vector has no nonZero entries (nothing in the dictionary), let's not waste space sending it to disk. if (vector.getNumNondefaultElements() > 0) { VectorWritable vectorWritable = new VectorWritable(vector); context.write(key, vectorWritable); } else { context.getCounter("TFParticalVectorReducer", "emptyVectorCount").increment(1); } }
From source file:edu.isi.pfindr.learn.model.Shingles.java
License:Apache License
public static List<String> computeShingles(String data) { //System.out.println("I an here"); data = data.toLowerCase();/* w w w .j av a 2 s .co m*/ List<String> shingleList = new ArrayList<String>(); //System.out.println("DATA inside expandWithDictionaryForShingles "+ data); try { Tokenizer analyzer = new Tokenizer(Version.LUCENE_30); TokenStream tokenStream = analyzer.tokenStream("", new StringReader(data)); ShingleFilter filter = new ShingleFilter(tokenStream, 4); filter.setOutputUnigrams(false); TermAttribute termAtt = (TermAttribute) filter.getAttribute(TermAttribute.class); //System.out.print("Printing the shingles "); while (filter.incrementToken()) { shingleList.add(termAtt.term().trim()); //.replaceAll("_", " ").replaceAll("\\s+", " ").trim()); //System.out.print(termAtt.term()+ "\t"); } } catch (Exception e) { e.printStackTrace(); } logger.info("Shingle List size returned: " + shingleList.size()); return shingleList; }
From source file:edu.rosehulman.CollocMapper.java
License:Apache License
/** * Collocation finder: pass 1 map phase. * <p/>//from w w w.j a va 2 s. com * Receives a token stream which gets passed through a Lucene ShingleFilter. The ShingleFilter delivers ngrams of * the appropriate size which are then decomposed into head and tail subgrams which are collected in the * following manner * <p/> * <pre> * k:head_key, v:head_subgram * k:head_key,ngram_key, v:ngram * k:tail_key, v:tail_subgram * k:tail_key,ngram_key, v:ngram * </pre> * <p/> * The 'head' or 'tail' prefix is used to specify whether the subgram in question is the head or tail of the * ngram. In this implementation the head of the ngram is a (n-1)gram, and the tail is a (1)gram. * <p/> * For example, given 'click and clack' and an ngram length of 3: * <pre> * k: head_'click and' v:head_'click and' * k: head_'click and',ngram_'click and clack' v:ngram_'click and clack' * k: tail_'clack', v:tail_'clack' * k: tail_'clack',ngram_'click and clack' v:ngram_'click and clack' * </pre> * <p/> * Also counts the total number of ngrams encountered and adds it to the counter * CollocDriver.Count.NGRAM_TOTAL * </p> * * @throws IOException if there's a problem with the ShingleFilter reading data or the collector collecting output. 
*/ @Override protected void map(Text key, StringTuple value, final Context context) throws IOException, InterruptedException { ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxShingleSize); sf.reset(); try { int count = 0; // ngram count OpenObjectIntHashMap<String> ngrams = new OpenObjectIntHashMap<String>( value.getEntries().size() * (maxShingleSize - 1)); OpenObjectIntHashMap<String> unigrams = new OpenObjectIntHashMap<String>(value.getEntries().size()); do { String term = sf.getAttribute(CharTermAttribute.class).toString(); String type = sf.getAttribute(TypeAttribute.class).type(); if ("shingle".equals(type)) { count++; ngrams.adjustOrPutValue(term, 1, 1); } else if (emitUnigrams && !term.isEmpty()) { // unigram unigrams.adjustOrPutValue(term, 1, 1); } } while (sf.incrementToken()); final GramKey gramKey = new GramKey(); ngrams.forEachPair(new ObjectIntProcedure<String>() { public boolean apply(String term, int frequency) { // obtain components, the leading (n-1)gram and the trailing unigram. int i = term.lastIndexOf(' '); // TODO: fix for non-whitespace delimited languages. 
if (i != -1) { // bigram, trigram etc try { Gram ngram = new Gram(term, frequency, Gram.Type.NGRAM); Gram head = new Gram(term.substring(0, i), frequency, Gram.Type.HEAD); Gram tail = new Gram(term.substring(i + 1), frequency, Gram.Type.TAIL); gramKey.set(head, EMPTY); context.write(gramKey, head); gramKey.set(head, ngram.getBytes()); context.write(gramKey, ngram); gramKey.set(tail, EMPTY); context.write(gramKey, tail); gramKey.set(tail, ngram.getBytes()); context.write(gramKey, ngram); } catch (IOException e) { throw new IllegalStateException(e); } catch (InterruptedException e) { throw new IllegalStateException(e); } } return true; } }); unigrams.forEachPair(new ObjectIntProcedure<String>() { public boolean apply(String term, int frequency) { try { Gram unigram = new Gram(term, frequency, Gram.Type.UNIGRAM); gramKey.set(unigram, EMPTY); context.write(gramKey, unigram); } catch (IOException e) { throw new IllegalStateException(e); } catch (InterruptedException e) { throw new IllegalStateException(e); } return true; } }); context.getCounter(Count.NGRAM_TOTAL).increment(count); sf.end(); } finally { Closeables.close(sf, true); } }
From source file:edu.rosehulman.TFPartialVectorReducer.java
License:Apache License
@Override protected void reduce(Text key, Iterable<StringTuple> values, Context context) throws IOException, InterruptedException { Iterator<StringTuple> it = values.iterator(); if (!it.hasNext()) { return;//from w w w.j a va 2s.c o m } StringTuple value = it.next(); Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size if (maxNGramSize >= 2) { ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxNGramSize); sf.reset(); try { do { String term = sf.getAttribute(CharTermAttribute.class).toString(); if (!term.isEmpty() && dictionary.containsKey(term)) { // ngram int termId = dictionary.get(term); vector.setQuick(termId, vector.getQuick(termId) + 1); } } while (sf.incrementToken()); sf.end(); } finally { Closeables.close(sf, true); } } else { for (String term : value.getEntries()) { if (!term.isEmpty() && dictionary.containsKey(term)) { // unigram int termId = dictionary.get(term); vector.setQuick(termId, vector.getQuick(termId) + 1); } } } if (sequentialAccess) { vector = new SequentialAccessSparseVector(vector); } if (namedVector) { vector = new NamedVector(vector, key.toString()); } // if the vector has no nonZero entries (nothing in the dictionary), let's not waste space sending it to disk. if (vector.getNumNondefaultElements() > 0) { VectorWritable vectorWritable = new VectorWritable(vector); context.write(key, vectorWritable); } else { context.getCounter("TFPartialVectorReducer", "emptyVectorCount").increment(1); } }
From source file:org.apache.mahout.classifier.bayes.mapreduce.common.BayesFeatureMapper.java
License:Apache License
/** * We need to count the number of times we've seen a term with a given label and we need to output that. But * this Mapper does more than just outputing the count. It first does weight normalisation. Secondly, it * outputs for each unique word in a document value 1 for summing up as the Term Document Frequency. Which * later is used to calculate the Idf Thirdly, it outputs for each label the number of times a document was * seen(Also used in Idf Calculation)// w w w .j ava 2 s.c om * * @param key * The label * @param value * the features (all unique) associated w/ this label in stringtuple format * @param output * The OutputCollector to write the results to * @param reporter * Not used */ @Override public void map(Text key, Text value, final OutputCollector<StringTuple, DoubleWritable> output, Reporter reporter) throws IOException { // String line = value.toString(); final String label = key.toString(); String[] tokens = SPACE_PATTERN.split(value.toString()); OpenObjectIntHashMap<String> wordList = new OpenObjectIntHashMap<String>(tokens.length * gramSize); if (gramSize > 1) { ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(new ArrayIterator<String>(tokens)), gramSize); do { String term = (sf.getAttribute(TermAttribute.class)).term(); if (term.length() > 0) { if (wordList.containsKey(term)) { wordList.put(term, 1 + wordList.get(term)); } else { wordList.put(term, 1); } } } while (sf.incrementToken()); } else { for (String term : tokens) { if (wordList.containsKey(term)) { wordList.put(term, 1 + wordList.get(term)); } else { wordList.put(term, 1); } } } final MutableDouble lengthNormalisationMut = new MutableDouble(0.0); wordList.forEachPair(new ObjectIntProcedure<String>() { @Override public boolean apply(String word, int dKJ) { lengthNormalisationMut.add(dKJ * dKJ); return true; } }); final double lengthNormalisation = Math.sqrt(lengthNormalisationMut.doubleValue()); // Output Length Normalized + TF Transformed Frequency per Word per Class // Log(1 
+ D_ij)/SQRT( SIGMA(k, D_kj) ) wordList.forEachPair(new ObjectIntProcedure<String>() { @Override public boolean apply(String token, int dKJ) { try { StringTuple tuple = new StringTuple(); tuple.add(BayesConstants.WEIGHT); tuple.add(label); tuple.add(token); DoubleWritable f = new DoubleWritable(Math.log(1.0 + dKJ) / lengthNormalisation); output.collect(tuple, f); } catch (IOException e) { throw new IllegalStateException(e); } return true; } }); reporter.setStatus("Bayes Feature Mapper: Document Label: " + label); // Output Document Frequency per Word per Class // Corpus Document Frequency (FEATURE_COUNT) // Corpus Term Frequency (FEATURE_TF) wordList.forEachPair(new ObjectIntProcedure<String>() { @Override public boolean apply(String token, int dKJ) { try { StringTuple dfTuple = new StringTuple(); dfTuple.add(BayesConstants.DOCUMENT_FREQUENCY); dfTuple.add(label); dfTuple.add(token); output.collect(dfTuple, ONE); StringTuple tokenCountTuple = new StringTuple(); tokenCountTuple.add(BayesConstants.FEATURE_COUNT); tokenCountTuple.add(token); output.collect(tokenCountTuple, ONE); StringTuple tokenTfTuple = new StringTuple(); tokenTfTuple.add(BayesConstants.FEATURE_TF); tokenTfTuple.add(token); output.collect(tokenTfTuple, new DoubleWritable(dKJ)); } catch (IOException e) { throw new IllegalStateException(e); } return true; } }); // output that we have seen the label to calculate the Count of Document per // class StringTuple labelCountTuple = new StringTuple(); labelCountTuple.add(BayesConstants.LABEL_COUNT); labelCountTuple.add(label); output.collect(labelCountTuple, ONE); }
From source file:org.apache.mahout.utils.nlp.collocations.llr.CollocMapper.java
License:Apache License
/** * Collocation finder: pass 1 map phase. * <p/>// w ww . j av a 2 s . c o m * Receives a token stream which gets passed through a Lucene ShingleFilter. The ShingleFilter delivers ngrams of * the appropriate size which are then decomposed into head and tail subgrams which are collected in the * following manner * * <pre> * k:head_key, v:head_subgram * k:head_key,ngram_key, v:ngram * k:tail_key, v:tail_subgram * k:tail_key,ngram_key, v:ngram * </pre> * * The 'head' or 'tail' prefix is used to specify whether the subgram in question is the head or tail of the * ngram. In this implementation the head of the ngram is a (n-1)gram, and the tail is a (1)gram. * <p/> * For example, given 'click and clack' and an ngram length of 3: * <pre> * k: head_'click and' v:head_'click and' * k: head_'click and',ngram_'click and clack' v:ngram_'click and clack' * k: tail_'clack', v:tail_'clack' * k: tail_'clack',ngram_'click and clack' v:ngram_'click and clack' * </pre> * * Also counts the total number of ngrams encountered and adds it to the counter * CollocDriver.Count.NGRAM_TOTAL * </p> * * @throws IOException * if there's a problem with the ShingleFilter reading data or the collector collecting output. 
*/ @Override protected void map(Text key, StringTuple value, final Context context) throws IOException, InterruptedException { ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxShingleSize); int count = 0; // ngram count OpenObjectIntHashMap<String> ngrams = new OpenObjectIntHashMap<String>( value.getEntries().size() * (maxShingleSize - 1)); OpenObjectIntHashMap<String> unigrams = new OpenObjectIntHashMap<String>(value.getEntries().size()); do { String term = (sf.getAttribute(TermAttribute.class)).term(); String type = (sf.getAttribute(TypeAttribute.class)).type(); if ("shingle".equals(type)) { count++; ngrams.adjustOrPutValue(term, 1, 1); } else if (emitUnigrams && term.length() > 0) { // unigram unigrams.adjustOrPutValue(term, 1, 1); } } while (sf.incrementToken()); try { final GramKey gramKey = new GramKey(); ngrams.forEachPair(new ObjectIntProcedure<String>() { @Override public boolean apply(String term, int frequency) { // obtain components, the leading (n-1)gram and the trailing unigram. int i = term.lastIndexOf(' '); // TODO: fix for non-whitespace delimited languages. 
if (i != -1) { // bigram, trigram etc try { Gram ngram = new Gram(term, frequency, Gram.Type.NGRAM); Gram head = new Gram(term.substring(0, i), frequency, Gram.Type.HEAD); Gram tail = new Gram(term.substring(i + 1), frequency, Gram.Type.TAIL); gramKey.set(head, EMPTY); context.write(gramKey, head); gramKey.set(head, ngram.getBytes()); context.write(gramKey, ngram); gramKey.set(tail, EMPTY); context.write(gramKey, tail); gramKey.set(tail, ngram.getBytes()); context.write(gramKey, ngram); } catch (IOException e) { throw new IllegalStateException(e); } catch (InterruptedException e) { throw new IllegalStateException(e); } } return true; } }); unigrams.forEachPair(new ObjectIntProcedure<String>() { @Override public boolean apply(String term, int frequency) { try { Gram unigram = new Gram(term, frequency, Gram.Type.UNIGRAM); gramKey.set(unigram, EMPTY); context.write(gramKey, unigram); } catch (IOException e) { throw new IllegalStateException(e); } catch (InterruptedException e) { throw new IllegalStateException(e); } return true; } }); } catch (IllegalStateException ise) { // catch an re-throw original exceptions from the procedures. if (ise.getCause() instanceof IOException) { throw (IOException) ise.getCause(); } else { // wasn't what was expected, so re-throw throw ise; } } context.getCounter(Count.NGRAM_TOTAL).increment(count); sf.end(); sf.close(); }
From source file:org.apache.mahout.utils.vectors.text.term.TFPartialVectorReducer.java
License:Apache License
@Override protected void reduce(Text key, Iterable<StringTuple> values, Context context) throws IOException, InterruptedException { Iterator<StringTuple> it = values.iterator(); if (!it.hasNext()) { return;// www .j a va 2 s. c om } StringTuple value = it.next(); Vector vector = new RandomAccessSparseVector(dimension, value.length()); // guess at initial size if (maxNGramSize >= 2) { ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxNGramSize); do { String term = (sf.getAttribute(TermAttribute.class)).term(); if (term.length() > 0) { // ngram if (dictionary.containsKey(term)) { int termId = dictionary.get(term); vector.setQuick(termId, vector.getQuick(termId) + 1); } } } while (sf.incrementToken()); sf.end(); sf.close(); } else { for (String term : value.getEntries()) { if (term.length() > 0) { // unigram if (dictionary.containsKey(term)) { int termId = dictionary.get(term); vector.setQuick(termId, vector.getQuick(termId) + 1); } } } } if (sequentialAccess) { vector = new SequentialAccessSparseVector(vector); } // if the vector has no nonZero entries (nothing in the dictionary), let's not waste space sending it to disk. if (vector.getNumNondefaultElements() > 0) { VectorWritable vectorWritable = new VectorWritable(new NamedVector(vector, key.toString())); context.write(key, vectorWritable); } else { context.getCounter("TFParticalVectorReducer", "emptyVectorCount").increment(1); } }
From source file:org.apache.mahout.vectorizer.collocations.llr.CollocMapper.java
License:Apache License
/** * Collocation finder: pass 1 map phase. * <p/>/*from w ww . ja v a 2s . c o m*/ * Receives a token stream which gets passed through a Lucene ShingleFilter. The ShingleFilter delivers ngrams of * the appropriate size which are then decomposed into head and tail subgrams which are collected in the * following manner * <p/> * <pre> * k:head_key, v:head_subgram * k:head_key,ngram_key, v:ngram * k:tail_key, v:tail_subgram * k:tail_key,ngram_key, v:ngram * </pre> * <p/> * The 'head' or 'tail' prefix is used to specify whether the subgram in question is the head or tail of the * ngram. In this implementation the head of the ngram is a (n-1)gram, and the tail is a (1)gram. * <p/> * For example, given 'click and clack' and an ngram length of 3: * <pre> * k: head_'click and' v:head_'click and' * k: head_'click and',ngram_'click and clack' v:ngram_'click and clack' * k: tail_'clack', v:tail_'clack' * k: tail_'clack',ngram_'click and clack' v:ngram_'click and clack' * </pre> * <p/> * Also counts the total number of ngrams encountered and adds it to the counter * CollocDriver.Count.NGRAM_TOTAL * </p> * * @throws IOException if there's a problem with the ShingleFilter reading data or the collector collecting output. 
*/ @Override protected void map(Text key, StringTuple value, final Context context) throws IOException, InterruptedException { ShingleFilter sf = new ShingleFilter(new IteratorTokenStream(value.getEntries().iterator()), maxShingleSize); sf.reset(); try { int count = 0; // ngram count OpenObjectIntHashMap<String> ngrams = new OpenObjectIntHashMap<String>( value.getEntries().size() * (maxShingleSize - 1)); OpenObjectIntHashMap<String> unigrams = new OpenObjectIntHashMap<String>(value.getEntries().size()); do { String term = sf.getAttribute(CharTermAttribute.class).toString(); String type = sf.getAttribute(TypeAttribute.class).type(); if ("shingle".equals(type)) { count++; ngrams.adjustOrPutValue(term, 1, 1); } else if (emitUnigrams && !term.isEmpty()) { // unigram unigrams.adjustOrPutValue(term, 1, 1); } } while (sf.incrementToken()); final GramKey gramKey = new GramKey(); ngrams.forEachPair(new ObjectIntProcedure<String>() { @Override public boolean apply(String term, int frequency) { // obtain components, the leading (n-1)gram and the trailing unigram. int i = term.lastIndexOf(' '); // TODO: fix for non-whitespace delimited languages. 
if (i != -1) { // bigram, trigram etc try { Gram ngram = new Gram(term, frequency, Gram.Type.NGRAM); Gram head = new Gram(term.substring(0, i), frequency, Gram.Type.HEAD); Gram tail = new Gram(term.substring(i + 1), frequency, Gram.Type.TAIL); gramKey.set(head, EMPTY); context.write(gramKey, head); gramKey.set(head, ngram.getBytes()); context.write(gramKey, ngram); gramKey.set(tail, EMPTY); context.write(gramKey, tail); gramKey.set(tail, ngram.getBytes()); context.write(gramKey, ngram); } catch (IOException e) { throw new IllegalStateException(e); } catch (InterruptedException e) { throw new IllegalStateException(e); } } return true; } }); unigrams.forEachPair(new ObjectIntProcedure<String>() { @Override public boolean apply(String term, int frequency) { try { Gram unigram = new Gram(term, frequency, Gram.Type.UNIGRAM); gramKey.set(unigram, EMPTY); context.write(gramKey, unigram); } catch (IOException e) { throw new IllegalStateException(e); } catch (InterruptedException e) { throw new IllegalStateException(e); } return true; } }); context.getCounter(Count.NGRAM_TOTAL).increment(count); sf.end(); } finally { Closeables.close(sf, true); } }