List of usage examples for org.apache.mahout.common.StringTuple
public StringTuple()
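StringTuple is a Writable that holds an ordered list of Strings; Mahout's text-processing jobs use it as the value type for tokenized documents in sequence files. Before the examples below, here is a minimal sketch of the basic API (construction, add, stringAt, getEntries), assuming only mahout-core on the classpath; the class name StringTupleDemo is hypothetical:

import org.apache.mahout.common.StringTuple;

public class StringTupleDemo {
    public static void main(String[] args) {
        // build a tuple of tokens and read them back
        StringTuple tuple = new StringTuple();
        tuple.add("hello");
        tuple.add("world");
        System.out.println(tuple.length());    // 2
        System.out.println(tuple.stringAt(0)); // hello
        for (String entry : tuple.getEntries()) {
            System.out.println(entry);
        }
    }
}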
From source file:com.clustertest2.clustertest2.vectorization.DocTokenizer.java
public void performWork(Path doc) throws IOException {
    try {
        System.out.println("performing token work");
        HashMap<Text, StringTuple> tokenized = new HashMap<>();
        StringBuilder part = new StringBuilder();
        // store the tokens of each doc
        for (Pair<Writable, Writable> pair : new SequenceFileDirIterable<>(doc, PathType.GLOB,
                ClusterFileService.CONF)) {
            String key = pair.getFirst().toString();
            System.out.println(key);
            String value = pair.getSecond().toString();
            part.append(key);
            TokenStream stream = analyzer.tokenStream(key, new StringReader(value));
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            StringTuple document = new StringTuple();
            while (stream.incrementToken()) {
                if (termAtt.length() > 0) {
                    document.add(new String(termAtt.buffer(), 0, termAtt.length()));
                }
            }
            stream.end();
            stream.close();
            tokenized.put(new Text(key), document);
        }
        // write the sequence file; try-with-resources closes the writer
        Path tokenizedSeq = new Path(vectorsDir, part.toString());
        try (SequenceFile.Writer writer = new SequenceFile.Writer(ClusterFileService.FS,
                ClusterFileService.CONF, tokenizedSeq, Text.class, StringTuple.class)) {
            for (Text k : tokenized.keySet()) {
                writer.append(k, tokenized.get(k));
            }
            System.out.println("wrote");
        }
    } catch (Exception e) {
        System.out.println(e.getMessage());
    } finally {
        numThreads.decrementAndGet();
    }
}
From source file:com.clustertest2.clustertest2.vectorization.TokenBuilder.java
@Override
public void performWork(Path inputDoc, Path outputDir) {
    try {
        HashMap<Text, StringTuple> tokenized = new HashMap<>();
        StringBuilder docName = new StringBuilder();
        // store the tokens of each doc
        for (Pair<Writable, Writable> pair : new SequenceFileDirIterable<>(inputDoc, PathType.GLOB,
                ClusterFileService.CONF)) {
            String key = pair.getFirst().toString();
            String value = pair.getSecond().toString();
            docName.append(key);
            StringTuple document;
            try (TokenStream stream = analyzer.tokenStream(key, new StringReader(value))) {
                CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
                stream.reset();
                document = new StringTuple();
                while (stream.incrementToken()) {
                    if (termAtt.length() > 0) {
                        document.add(new String(termAtt.buffer(), 0, termAtt.length()));
                    }
                }
                stream.end();
            }
            tokenized.put(new Text(key), document);
        }
        // write the sequence file, overwriting any old vector file
        Path tokenizedSeq = new Path(outputDir, docName.toString());
        ClusterFileService.FS.delete(tokenizedSeq, true);
        try (SequenceFile.Writer writer = new SequenceFile.Writer(ClusterFileService.FS,
                ClusterFileService.CONF, tokenizedSeq, Text.class, StringTuple.class)) {
            for (Text k : tokenized.keySet()) {
                writer.append(k, tokenized.get(k));
            }
        }
    } catch (IOException e) {
        System.out.println(e.getMessage());
    } finally {
        numThreads.decrementAndGet();
    }
}
From source file:com.digitalpebble.behemoth.mahout.BehemothTokenizerMapper.java
License:Apache License
@Override
protected void map(Text key, BehemothDocument value, Context context) throws IOException, InterruptedException {
    StringTuple document = new StringTuple();
    Iterator<Annotation> iter = value.getAnnotations().iterator();
    while (iter.hasNext()) {
        Annotation annot = iter.next();
        // check the type
        if (!annot.getType().equals(tokenType))
            continue;
        java.util.Map<String, String> features = annot.getFeatures();
        if (features == null)
            continue;
        String featureValue = null;
        // no feature? use the underlying text
        if (tokenFeature.equals("")) {
            featureValue = value.getText().substring((int) annot.getStart(), (int) annot.getEnd());
        } else {
            featureValue = features.get(tokenFeature);
        }
        if (featureValue == null)
            continue;
        document.add(featureValue);
    }
    context.write(key, document);
}
From source file:com.digitalpebble.behemoth.mahout.LuceneTokenizerMapper.java
License:Apache License
@Override
protected void map(Text key, BehemothDocument value, Context context) throws IOException, InterruptedException {
    String sContent = value.getText();
    if (sContent == null) {
        // no text available? skip
        context.getCounter("LuceneTokenizer", "BehemothDocWithoutText").increment(1);
        return;
    }
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(sContent));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    StringTuple document = new StringTuple();
    stream.reset();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            document.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    stream.end();
    stream.close();
    context.write(key, document);
}
From source file:com.gsvic.csmr.io.InputData.java
License:Apache License
/**
 * Reads a tokenized document.
 *
 * @param conf the Hadoop configuration
 * @param input the path of the tokenized sequence file
 * @return the document tokens (StringTuples) in a HashMap
 * @throws IOException
 */
public HashMap<Text, StringTuple> readTokenizedDocument(Configuration conf, Path input) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    HashMap<Text, StringTuple> tokensMap = new HashMap<>();
    try (SequenceFile.Reader reader = new SequenceFile.Reader(fs, input, conf)) {
        Text key = new Text();
        StringTuple value = new StringTuple();
        while (reader.next(key, value)) {
            // copy key and entries: next() reuses the same Writable instances
            tokensMap.put(new Text(key), new StringTuple(value.getEntries()));
        }
    }
    return tokensMap;
}
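A hypothetical call site for the reader above (the input path is a placeholder, and a no-arg InputData constructor is assumed):

Configuration conf = new Configuration();
Path input = new Path("tokenized-documents/part-m-00000"); // placeholder path
HashMap<Text, StringTuple> tokens = new InputData().readTokenizedDocument(conf, input);
for (Map.Entry<Text, StringTuple> entry : tokens.entrySet()) {
    System.out.println(entry.getKey() + " -> " + entry.getValue().getEntries());
}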
From source file:com.ml.hadoop.nlp.SequenceFileTokenizerMapper.java
License:Apache License
@Override
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset(); // reset once, after the attribute is registered
    StringTuple document = new StringTuple();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            document.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    stream.end();
    Closeables.close(stream, true);
    // drop stop words
    document = StopWordsHandler.dropStopWords(document);
    context.write(key, document);
}
From source file:edu.indiana.d2i.htrc.io.DataCopyTokenizerMapper.java
License:Apache License
@Override
public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    long initCPU = System.nanoTime();
    // reusableTokenStream is the pre-Lucene-4 API; later versions use tokenStream
    TokenStream stream = analyzer.reusableTokenStream(key.toString(), new StringReader(value.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    StringTuple document = new StringTuple();
    stream.reset();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            String term = new String(termAtt.buffer(), 0, termAtt.length());
            document.add(term);
            numTerms++;
        }
    }
    elapsedTime += System.nanoTime() - initCPU;
    context.write(key, document);
}
From source file:org.apache.hadoop.mapred.nativetask.testutil.BytesUtil.java
License:Apache License
private static Object newMahoutObject(byte[] seed, String className) {
    if (className.equals(VarIntWritable.class.getName())) {
        return new VarIntWritable(Bytes.toInt(seed));
    } else if (className.equals(VarLongWritable.class.getName())) {
        return new VarLongWritable(Bytes.toLong(seed));
    } else if (className.equals(TreeID.class.getName())) {
        TreeID treeID = new TreeID();
        treeID.set(Bytes.toLong(seed));
        return treeID;
    } else if (className.equals(SplitPartitionedWritable.class.getName())) {
        SplitPartitionedWritable spWritable = new SplitPartitionedWritable();
        long taskItemOrdinal = Math.abs(Bytes.toLong(seed, 4));
        spWritable.setTaskItemOrdinal(taskItemOrdinal);
        return spWritable;
    } else if (className.equals(EntityEntityWritable.class.getName())) {
        return new EntityEntityWritable(Bytes.toLong(seed, 0), Bytes.toLong(seed, 8));
    } else if (className.equals(Gram.class.getName())) {
        String ngram = Bytes.toStringBinary(seed);
        return new Gram(ngram, Gram.Type.NGRAM);
    } else if (className.equals(GramKey.class.getName())) {
        int primaryLength = r.nextInt(seed.length);
        Gram gram = new Gram(Bytes.toStringBinary(seed, 0, Math.max(primaryLength, 1)), Gram.Type.NGRAM);
        byte[] order = new byte[seed.length - primaryLength];
        System.arraycopy(seed, primaryLength, order, 0, order.length);
        return new GramKey(gram, order);
    } else if (className.equals(StringTuple.class.getName())) {
        // build a random StringTuple of up to three entries drawn from the seed bytes
        int tupleSize = r.nextInt(4);
        StringTuple stringTuple = new StringTuple();
        for (int i = 0; i < tupleSize; i++) {
            int index = r.nextInt(seed.length);
            stringTuple.add(Bytes.toStringBinary(seed, index, seed.length - index));
        }
        return stringTuple;
    } else {
        return null;
    }
}