Example usage for org.apache.mahout.common.StringTuple StringTuple()

List of usage examples for org.apache.mahout.common.StringTuple StringTuple()

Introduction

On this page you can find example usage of the org.apache.mahout.common.StringTuple no-argument constructor, StringTuple().

Prototype

public StringTuple() 
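
A minimal sketch of what the no-argument constructor gives you: an empty tuple whose entries you append with add() and read back with getEntries(). This snippet is for orientation only and is not taken from the sources below; the token values are made up.

import org.apache.mahout.common.StringTuple;

StringTuple document = new StringTuple();   // empty tuple
document.add("hello");                      // entries keep their insertion order
document.add("world");
System.out.println(document.getEntries());  // prints [hello, world]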

Usage

From source file:com.clustertest2.clustertest2.vectorization.DocTokenizer.java

public void performWork(Path doc) throws IOException {
    try {
        System.out.println("performing token work");
        HashMap<Text, StringTuple> tokenized = new HashMap<>();
        StringBuilder part = new StringBuilder();
        // store the tokens of each doc
        for (Pair<Writable, Writable> pair : new SequenceFileDirIterable<>(doc, PathType.GLOB,
                ClusterFileService.CONF)) {
            String key = pair.getFirst().toString();
            System.out.println(key);
            String value = pair.getSecond().toString();
            part.append(key);
            TokenStream stream = analyzer.tokenStream(key, new StringReader(value));
            CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            StringTuple document = new StringTuple();
            while (stream.incrementToken()) {
                if (termAtt.length() > 0) {
                    document.add(new String(termAtt.buffer(), 0, termAtt.length()));
                }
            }
            stream.end();
            stream.close();

            tokenized.put(new Text(key), document);
        }
        // write the sequencefile
        Path tokenizedSeq = new Path(vectorsDir, part.toString());
        try (SequenceFile.Writer writer = new SequenceFile.Writer(ClusterFileService.FS,
                ClusterFileService.CONF, tokenizedSeq, Text.class, StringTuple.class)) {
            for (Text k : tokenized.keySet()) {
                writer.append(k, tokenized.get(k));
            }
            System.out.println("wrote");
        }
    } catch (Exception e) {
        System.out.println(e.getMessage());
    } finally {
        numThreads.decrementAndGet();
    }
}

From source file:com.clustertest2.clustertest2.vectorization.TokenBuilder.java

@Override
public void performWork(Path inputDoc, Path outputDir) {
    try {
        HashMap<Text, StringTuple> tokenized = new HashMap<>();
        StringBuilder docName = new StringBuilder();
        // store the tokens of each doc
        for (Pair<Writable, Writable> pair : new SequenceFileDirIterable<>(inputDoc, PathType.GLOB,
                ClusterFileService.CONF)) {
            String key = pair.getFirst().toString();
            String value = pair.getSecond().toString();
            docName.append(key);
            StringTuple document;
            try (TokenStream stream = analyzer.tokenStream(key, new StringReader(value))) {
                CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
                stream.reset();
                document = new StringTuple();
                while (stream.incrementToken()) {
                    if (termAtt.length() > 0) {
                        document.add(new String(termAtt.buffer(), 0, termAtt.length()));
                    }
                }
                stream.end();
            }
            tokenized.put(new Text(key), document);
        }
        // write the sequencefile
        Path tokenizedSeq = new Path(outputDir, docName.toString());
        // overwrite old vector file
        ClusterFileService.FS.delete(tokenizedSeq, true);
        try (SequenceFile.Writer writer = new SequenceFile.Writer(ClusterFileService.FS,
                ClusterFileService.CONF, tokenizedSeq, Text.class, StringTuple.class)) {
            for (Text k : tokenized.keySet()) {
                writer.append(k, tokenized.get(k));
            }
        }
    } catch (IOException e) {
        System.out.println(e.getMessage());
    } finally {
        numThreads.decrementAndGet();
    }
}

From source file:com.digitalpebble.behemoth.mahout.BehemothTokenizerMapper.java

License:Apache License

@Override
protected void map(Text key, BehemothDocument value, Context context) throws IOException, InterruptedException {
    StringTuple document = new StringTuple();
    Iterator<Annotation> iter = value.getAnnotations().iterator();

    while (iter.hasNext()) {
        Annotation annot = iter.next();
        // check the type
        if (!annot.getType().equals(tokenType))
            continue;
        java.util.Map<String, String> features = annot.getFeatures();
        if (features == null)
            continue;

        String featureValue = null;

        // no feature? use the underlying text
        if (tokenFeature.equals("")) {
            featureValue = value.getText().substring((int) annot.getStart(), (int) annot.getEnd());
        } else
            featureValue = features.get(tokenFeature);
        if (featureValue == null)
            continue;
        document.add(featureValue);

    }
    context.write(key, document);
}

From source file:com.digitalpebble.behemoth.mahout.LuceneTokenizerMapper.java

License:Apache License

@Override
protected void map(Text key, BehemothDocument value, Context context) throws IOException, InterruptedException {
    String sContent = value.getText();
    if (sContent == null) {
        // no text available? skip
        context.getCounter("LuceneTokenizer", "BehemothDocWithoutText").increment(1);
        return;
    }
    // tokenize the document text with the configured analyzer
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(sContent));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    StringTuple document = new StringTuple();
    stream.reset();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            document.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    context.write(key, document);
}

From source file:com.gsvic.csmr.io.InputData.java

License:Apache License

/**
 * Reads the tokenized document.
 * @param conf the Hadoop configuration
 * @param input the path of the tokenized SequenceFile
 * @return the document tokens (StringTuples) in a HashMap
 * @throws IOException
 */
public HashMap<Text, StringTuple> readTokenizedDocument(Configuration conf, Path input) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, input, conf);
    Text key = new Text();
    StringTuple value = new StringTuple();
    HashMap<Text, StringTuple> tokensMap = new HashMap<>();

    while (reader.next(key, value)) {
        tokensMap.put(new Text(key), new StringTuple(value.getEntries()));
    }
    reader.close();

    return tokensMap;
}

From source file:com.ml.hadoop.nlp.SequenceFileTokenizerMapper.java

License:Apache License

@Override
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    StringTuple document = new StringTuple();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            document.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    stream.end();
    Closeables.close(stream, true);

    //drop stop words
    document = StopWordsHandler.dropStopWords(document);
    context.write(key, document);
}

From source file:edu.indiana.d2i.htrc.io.DataCopyTokenizerMapper.java

License:Apache License

@Override
public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    long initCPU = System.nanoTime();
    TokenStream stream = analyzer.reusableTokenStream(key.toString(), new StringReader(value.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    StringTuple document = new StringTuple();
    stream.reset();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            String term = new String(termAtt.buffer(), 0, termAtt.length());
            document.add(term);
            numTerms++;
        }
    }
    elapsedTime += System.nanoTime() - initCPU;

    context.write(key, document);
}

From source file:org.apache.hadoop.mapred.nativetask.testutil.BytesUtil.java

License:Apache License

private static Object newMahoutObject(byte[] seed, String className) {
    if (className.equals(VarIntWritable.class.getName())) {
        return new VarIntWritable(Bytes.toInt(seed));
    } else if (className.equals(VarLongWritable.class.getName())) {
        return new VarLongWritable(Bytes.toLong(seed));
    } else if (className.equals(TreeID.class.getName())) {
        TreeID treeID = new TreeID();
        treeID.set(Bytes.toLong(seed));
        return treeID;
    } else if (className.equals(SplitPartitionedWritable.class.getName())) {
        SplitPartitionedWritable spWritable = new SplitPartitionedWritable();
        long taskItemOrdinal = Math.abs(Bytes.toLong(seed, 4));
        spWritable.setTaskItemOrdinal(taskItemOrdinal);
        return spWritable;
    } else if (className.equals(EntityEntityWritable.class.getName())) {
        EntityEntityWritable entityWritable = new EntityEntityWritable(Bytes.toLong(seed, 0),
                Bytes.toLong(seed, 8));
        return entityWritable;
    } else if (className.equals(Gram.class.getName())) {
        String ngram = Bytes.toStringBinary(seed);
        return new Gram(ngram, Gram.Type.NGRAM);
    } else if (className.equals(GramKey.class.getName())) {
        int primaryLength = r.nextInt(seed.length);
        Gram gram = new Gram(Bytes.toStringBinary(seed, 0, Math.max(primaryLength, 1)), Gram.Type.NGRAM);
        byte[] order = new byte[seed.length - primaryLength];
        System.arraycopy(seed, primaryLength, order, 0, order.length);
        return new GramKey(gram, order);
    } else if (className.equals(StringTuple.class.getName())) {
        int tupleSize = r.nextInt(4);
        StringTuple stringTuple = new StringTuple();
        for (int i = 0; i < tupleSize; i++) {
            int index = r.nextInt(seed.length);
            stringTuple.add(Bytes.toStringBinary(seed, index, seed.length - index));
        }
        return stringTuple;
    } else {
        return null;
    }
}