Example usage for org.apache.mahout.common.StringTuple StringTuple()

List of usage examples for org.apache.mahout.common.StringTuple StringTuple()

Introduction

On this page you can find example usage of the org.apache.mahout.common.StringTuple no-argument constructor, StringTuple().

Prototype

public StringTuple() 
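
A minimal sketch of what the no-argument constructor gives you: an empty tuple whose entries you append with add() and read back with getEntries(). This snippet is for orientation only and is not taken from the sources below; the token values are made up.

import org.apache.mahout.common.StringTuple;

StringTuple document = new StringTuple();   // empty tuple
document.add("hello");                      // entries keep their insertion order
document.add("world");
System.out.println(document.getEntries());  // prints [hello, world]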

Usage

From source file:com.clustertest2.clustertest2.vectorization.DocTokenizer.java

public void performWork(Path doc) throws IOException {
    try {
        System.out.println("performing token work");
        HashMap<Text, StringTuple> tokenized = new HashMap<>();
        StringBuilder part = new StringBuilder();
        // store the tokens of each doc
        for (Pair<Writable, Writable> pair : new SequenceFileDirIterable<>(doc, PathType.GLOB,
                ClusterFileService.CONF)) {
            String key = pair.getFirst().toString();
            System.out.println(key);
            String value = pair.getSecond().toString();
            part.append(key);
            TokenStream stream = analyzer.tokenStream(key, new StringReader(value));
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            StringTuple document = new StringTuple();
            while (stream.incrementToken()) {
                if (termAtt.length() > 0) {
                    document.add(new String(termAtt.buffer(), 0, termAtt.length()));
                }
            }
            stream.end();
            stream.close();

            tokenized.put(new Text(key), document);
        }
        // write the sequencefile
        Path tokenizedSeq = new Path(vectorsDir, part.toString());
        try (SequenceFile.Writer writer = new SequenceFile.Writer(ClusterFileService.FS,
                ClusterFileService.CONF, tokenizedSeq, Text.class, StringTuple.class)) {
            for (Text k : tokenized.keySet()) {
                writer.append(k, tokenized.get(k));
            }
            System.out.println("wrote");
        }
    } catch (Exception e) {
        System.out.println(e.getMessage());
    } finally {
        numThreads.decrementAndGet();
    }
}

From source file:com.clustertest2.clustertest2.vectorization.TokenBuilder.java

@Override
public void performWork(Path inputDoc, Path outputDir) {
    try {
        HashMap<Text, StringTuple> tokenized = new HashMap<>();
        StringBuilder docName = new StringBuilder();
        // store the tokens of each doc
        for (Pair<Writable, Writable> pair : new SequenceFileDirIterable<>(inputDoc, PathType.GLOB,
                ClusterFileService.CONF)) {
            String key = pair.getFirst().toString();
            String value = pair.getSecond().toString();
            docName.append(key);
            StringTuple document;
            try (TokenStream stream = analyzer.tokenStream(key, new StringReader(value))) {
                CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
                stream.reset();
                document = new StringTuple();
                while (stream.incrementToken()) {
                    if (termAtt.length() > 0) {
                        document.add(new String(termAtt.buffer(), 0, termAtt.length()));
                    }
                }
                stream.end();
            }
            tokenized.put(new Text(key), document);
        }
        // write the sequencefile
        Path tokenizedSeq = new Path(outputDir, docName.toString());
        // overwrite old vector file
        ClusterFileService.FS.delete(tokenizedSeq, true);
        try (SequenceFile.Writer writer = new SequenceFile.Writer(ClusterFileService.FS,
                ClusterFileService.CONF, tokenizedSeq, Text.class, StringTuple.class)) {
            for (Text k : tokenized.keySet()) {
                writer.append(k, tokenized.get(k));
            }
        }
    } catch (IOException e) {
        System.out.println(e.getMessage());
    } finally {
        numThreads.decrementAndGet();
    }
}

From source file:com.digitalpebble.behemoth.mahout.BehemothTokenizerMapper.java

License:Apache License

@Override
protected void map(Text key, BehemothDocument value, Context context) throws IOException, InterruptedException {
    StringTuple document = new StringTuple();
    Iterator<Annotation> iter = value.getAnnotations().iterator();

    while (iter.hasNext()) {
        Annotation annot = iter.next();
        // check the type
        if (!annot.getType().equals(tokenType))
            continue;
        java.util.Map<String, String> features = annot.getFeatures();
        if (features == null)
            continue;

        String featureValue = null;

        // no feature? use the underlying text
        if (tokenFeature.equals("")) {
            featureValue = value.getText().substring((int) annot.getStart(), (int) annot.getEnd());
        } else
            featureValue = features.get(tokenFeature);
        if (featureValue == null)
            continue;
        document.add(featureValue);

    }
    context.write(key, document);
}

From source file:com.digitalpebble.behemoth.mahout.LuceneTokenizerMapper.java

License:Apache License

@Override
protected void map(Text key, BehemothDocument value, Context context) throws IOException, InterruptedException {
    String sContent = value.getText();
    if (sContent == null) {
        // no text available? skip
        context.getCounter("LuceneTokenizer", "BehemothDocWithoutText").increment(1);
        return;
    }
    // tokenize the document text with the configured analyzer
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(sContent));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    StringTuple document = new StringTuple();
    stream.reset();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            document.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    context.write(key, document);
}

From source file:com.gsvic.csmr.io.InputData.java

License:Apache License

/**
 * Reads the tokenized document.
 * @param conf the Hadoop configuration
 * @param input the path of the tokenized SequenceFile
 * @return the document tokens (StringTuples) in a HashMap
 * @throws IOException
 */
public HashMap<Text, StringTuple> readTokenizedDocument(Configuration conf, Path input) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, input, conf);
    Text key = new Text();
    StringTuple value = new StringTuple();
    HashMap<Text, StringTuple> tokensMap = new HashMap<>();

    while (reader.next(key, value)) {
        tokensMap.put(new Text(key), new StringTuple(value.getEntries()));
    }
    reader.close();

    return tokensMap;
}

From source file:com.ml.hadoop.nlp.SequenceFileTokenizerMapper.java

License:Apache License

@Override
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    StringTuple document = new StringTuple();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            document.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    stream.end();
    Closeables.close(stream, true);

    //drop stop words
    document = StopWordsHandler.dropStopWords(document);
    context.write(key, document);
}

From source file:edu.indiana.d2i.htrc.io.DataCopyTokenizerMapper.java

License:Apache License

@Override
public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    long initCPU = System.nanoTime();
    TokenStream stream = analyzer.reusableTokenStream(key.toString(), new StringReader(value.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    StringTuple document = new StringTuple();
    stream.reset();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            String term = new String(termAtt.buffer(), 0, termAtt.length());
            document.add(term);
            numTerms++;
        }
    }
    elapsedTime += System.nanoTime() - initCPU;

    context.write(key, document);
}

From source file:org.apache.hadoop.mapred.nativetask.testutil.BytesUtil.java

License:Apache License

private static Object newMahoutObject(byte[] seed, String className) {
    if (className.equals(VarIntWritable.class.getName())) {
        return new VarIntWritable(Bytes.toInt(seed));
    } else if (className.equals(VarLongWritable.class.getName())) {
        return new VarLongWritable(Bytes.toLong(seed));
    } else if (className.equals(TreeID.class.getName())) {
        TreeID treeID = new TreeID();
        treeID.set(Bytes.toLong(seed));
        return treeID;
    } else if (className.equals(SplitPartitionedWritable.class.getName())) {
        SplitPartitionedWritable spWritable = new SplitPartitionedWritable();
        long taskItemOrdinal = Math.abs(Bytes.toLong(seed, 4));
        spWritable.setTaskItemOrdinal(taskItemOrdinal);
        return spWritable;
    } else if (className.equals(EntityEntityWritable.class.getName())) {
        EntityEntityWritable entityWritable = new EntityEntityWritable(Bytes.toLong(seed, 0),
                Bytes.toLong(seed, 8));
        return entityWritable;
    } else if (className.equals(Gram.class.getName())) {
        String ngram = Bytes.toStringBinary(seed);
        return new Gram(ngram, Gram.Type.NGRAM);
    } else if (className.equals(GramKey.class.getName())) {
        int primaryLength = r.nextInt(seed.length);
        Gram gram = new Gram(Bytes.toStringBinary(seed, 0, Math.max(primaryLength, 1)), Gram.Type.NGRAM);
        byte[] order = new byte[seed.length - primaryLength];
        System.arraycopy(seed, primaryLength, order, 0, order.length);
        return new GramKey(gram, order);
    } else if (className.equals(StringTuple.class.getName())) {
        int tupleSize = r.nextInt(4);
        StringTuple stringTuple = new StringTuple();
        for (int i = 0; i < tupleSize; i++) {
            int index = r.nextInt(seed.length);
            stringTuple.add(Bytes.toStringBinary(seed, index, seed.length - index));
        }
        return stringTuple;
    } else {
        return null;
    }
}