Example usage for org.apache.hadoop.io Text toString

Introduction

On this page you can find example usage of org.apache.hadoop.io.Text.toString().

Prototype

@Override
public String toString() 

Document

Convert the Text back to a String.
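
A minimal, self-contained sketch of the call (the class name and sample string are illustrative, not taken from the examples below): toString() decodes the Text's UTF-8 bytes into a java.lang.String, which can then be split or parsed like any ordinary string, as most of the mappers below do.

import org.apache.hadoop.io.Text;

public class TextToStringExample {
    public static void main(String[] args) {
        // Wrap a UTF-8 string in a Hadoop Text and convert it back.
        Text text = new Text("hadoop,mapreduce");
        String s = text.toString();        // "hadoop,mapreduce"
        String[] fields = s.split(",");    // split like any ordinary String
        System.out.println(fields.length); // prints 2
    }
}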

Usage

From source file:com.me.neu.popular_tag_year.Mapper1.java

@Override
public void map(LongWritable k1, Text v1, OutputCollector<Text, Text> output, Reporter rprtr)
        throws IOException {
    String[] line = v1.toString().split(",");
    String[] tagNames = line[0].split(">");

    for (String tagName : tagNames) {
        output.collect(new Text(line[1]), new Text(tagName));
    }

}

From source file:com.me.neu.stackoverflow.Mapper1.java

@Override
public void map(LongWritable k1, Text v1, OutputCollector<Text, Text> output, Reporter rprtr)
        throws IOException {
    String[] line = v1.toString().split(",");
    String[] tagNames = line[1].split(">");

    for (String tagName : tagNames) {
        output.collect(new Text(line[0]), new Text(tagName));
    }

}

From source file:com.mh2c.WikipediaDumpLoaderMapper.java

License:Apache License

/**
 * key = article content
 * value = empty string
 */
@Override
public void map(Text key, Text value, OutputCollector<Text, Text> output, Reporter reporter)
        throws IOException {
    try {
        // Parse the page of XML into a document
        Document doc = db.parse(new InputSource(new StringReader(key.toString())));

        // Extract the title and text (article content) from the page content
        String title = doc.getElementsByTagName("title").item(0).getTextContent();
        String text = doc.getElementsByTagName("text").item(0).getTextContent();

        // Emit the title and text pair
        output.collect(new Text(title), new Text(text));
        reporter.getCounter(Counter.ARTICLES).increment(1L);
    } catch (SAXException e) {
        throw new IOException(e);
    }
}

From source file:com.mh2c.WikipediaWordCountMapper.java

License:Apache License

/**
 * key = title
 * value = text
 */
@Override
public void map(Text key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
        throws IOException {
    // Split the text content of the article on whitespace
    String[] words = value.toString().split("\\s+");
    // Count each word occurrence
    for (String word : words) {
        wordText.set(word);
        output.collect(wordText, ONE);
    }
}

From source file:com.missionsky.scp.dataanalysis.mahout.TestNaiveBayesDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    addInputOption();
    addOutputOption();
    addOption(DefaultOptionCreator.overwriteOption().create());
    addOption("model", "m", "The path to the model built during training", true);
    addOption(
            buildOption("testComplementary", "c", "test complementary?", false, false, String.valueOf(false)));
    addOption(buildOption("runSequential", "seq", "run sequential?", false, false, String.valueOf(false)));
    addOption("labelIndex", "l", "The path to the location of the label index", true);
    Map<String, List<String>> parsedArgs = parseArguments(args);
    if (parsedArgs == null) {
        return -1;
    }
    if (hasOption(DefaultOptionCreator.OVERWRITE_OPTION)) {
        HadoopUtil.delete(getConf(), getOutputPath());
    }

    boolean complementary = hasOption("testComplementary");
    boolean sequential = hasOption("runSequential");
    if (sequential) {
        FileSystem fs = FileSystem.get(getConf());
        NaiveBayesModel model = NaiveBayesModel.materialize(new Path(getOption("model")), getConf());
        AbstractNaiveBayesClassifier classifier;
        if (complementary) {
            classifier = new ComplementaryNaiveBayesClassifier(model);
        } else {
            classifier = new StandardNaiveBayesClassifier(model);
        }
        SequenceFile.Writer writer = new SequenceFile.Writer(fs, getConf(), getOutputPath(), Text.class,
                VectorWritable.class);
        Reader reader = new Reader(fs, getInputPath(), getConf());
        Text key = new Text();
        VectorWritable vw = new VectorWritable();
        while (reader.next(key, vw)) {
            writer.append(new Text(SLASH.split(key.toString())[1]),
                    new VectorWritable(classifier.classifyFull(vw.get())));
        }
        writer.close();
        reader.close();
    } else {
        boolean succeeded = runMapReduce(parsedArgs);
        if (!succeeded) {
            return -1;
        }
    }

    //load the labels
    Map<Integer, String> labelMap = BayesUtils.readLabelIndex(getConf(), new Path(getOption("labelIndex")));

    //loop over the results and create the confusion matrix
    SequenceFileDirIterable<Text, VectorWritable> dirIterable = new SequenceFileDirIterable<Text, VectorWritable>(
            getOutputPath(), PathType.LIST, PathFilters.partFilter(), getConf());
    ResultAnalyzer analyzer = new ResultAnalyzer(labelMap.values(), "DEFAULT");
    analyzeResults(labelMap, dirIterable, analyzer);

    log.info("{} Results: {}", complementary ? "Complementary" : "Standard NB", analyzer);
    return 0;
}

From source file:com.ml.hadoop.nlp.SequenceFileTokenizerMapper.java

License:Apache License

@Override
protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
    TokenStream stream = analyzer.tokenStream(key.toString(), new StringReader(value.toString()));
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    StringTuple document = new StringTuple();
    while (stream.incrementToken()) {
        if (termAtt.length() > 0) {
            document.add(new String(termAtt.buffer(), 0, termAtt.length()));
        }
    }
    stream.end();
    Closeables.close(stream, true);

    //drop stop words
    document = StopWordsHandler.dropStopWords(document);
    context.write(key, document);
}

From source file:com.ML_Hadoop.K_meansClustering.K_meansClusteringMap.java

@Override // '@Override' is required; otherwise the default map() is run
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    // Read a line of the input file and extract each data value
    String value_of_a_line_of_a_input_file = value.toString();
    String[] split_data_of_the_line = value_of_a_line_of_a_input_file.split("\t"); // split line by TAB
    Float[] values_of_data_of_the_line = new Float[feature_size];

    for (int i = 0; i < feature_size; i++)
        values_of_data_of_the_line[i] = Float.parseFloat(split_data_of_the_line[i]);

    similarityCheck(values_of_data_of_the_line);

}

From source file:com.ML_Hadoop.MultipleLinearRegression.MultipleLinearRegressionMap.java

@Override // necessary; otherwise the default map() is run
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String line = value.toString();
    String[] features = line.split(","); // split the line into its x and y values
    ArrayList<Float> values = new ArrayList<Float>();

    for (int i = 0; i < features.length; i++)
        values.add(new Float(features[i]));

    prediction_error.add(compute_J(values));
}

From source file:com.ML_Hadoop.NaiveBayesClassifier_Continuous_Features.NaiveBayesClassifierMap_Continuous_Features.java

@Override // necessary; otherwise the default map() is run
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
    String line = value.toString();
    String[] features_str = line.split(","); // split text into separate strings 
    class_id = Integer.parseInt(features_str[0]);
    features = new Float[features_str.length - 1];

    for (int i = 0; i < features.length; i++) {
        features[i] = Float.parseFloat(features_str[i + 1]);
    }
    ArrayList<Float[]> t;
    if (features_probabilities.get(class_id) == null) {
        t = new ArrayList<Float[]>();
    } else {
        t = features_probabilities.get(class_id);
    }
    t.add(features);
    features_probabilities.put(class_id, t);
    num_of_members_in_each_class[class_id]++;

}

From source file:com.mortardata.pig.JsonLoader.java

License:Apache License

@Override
public Tuple getNext() throws IOException {
    Text val = null;
    try {
        if (!reader.nextKeyValue())
            return null;
        val = (Text) reader.getCurrentValue();
    } catch (Exception e) {
        throw new IOException(e);
    }

    // Create a parser specific for this input line.  
    // This may not be the most efficient approach.
    // Use only the valid bytes of the Text; getBytes() may return a larger backing array
    ByteArrayInputStream bais = new ByteArrayInputStream(val.getBytes(), 0, val.getLength());
    JsonParser p = jsonFactory.createJsonParser(bais);

    Tuple t;

    // schema provided
    if (!useDefaultSchema) {
        // Create a map of field names to ResourceFieldSchema's,
        // and create a map of field names to positions in the tuple.
        // These are used during parsing to handle extra, missing, and/or out-of-order
        // fields properly.

        Map<String, ResourceFieldSchema> schemaMap = new HashMap<String, ResourceFieldSchema>();
        Map<String, Integer> schemaPositionMap = new HashMap<String, Integer>();

        if (requiredFields != null) {
            int count = 0;
            for (int i = 0; i < fields.length; i++) {
                if (requiredFields[i]) {
                    schemaMap.put(fields[i].getName(), fields[i]);
                    schemaPositionMap.put(fields[i].getName(), count);
                    count++;
                }
            }
            t = tupleFactory.newTuple(count);
        } else {
            for (int i = 0; i < fields.length; i++) {
                schemaMap.put(fields[i].getName(), fields[i]);
                schemaPositionMap.put(fields[i].getName(), i);
            }
            t = tupleFactory.newTuple(fields.length);
        }

        try {
            p.nextToken(); // move to start of object
            parseObjectIntoTuple(val.toString(), p, schemaMap, schemaPositionMap, t);
        } catch (JsonParseException jpe) {
            // If the line doesn't parse as a valid JSON object, log an error and move on
            log.error("Error parsing record: " + val + ": " + jpe.toString());
        }
    } else {
        // schema not provided: load whole document as a map
        t = tupleFactory.newTuple(1);
        try {
            p.nextToken(); // move to start of object
            t.set(0, readField(val.toString(), p, schema.getFields()[0]));
        } catch (JsonParseException jpe) {
            log.error("Error parsing record: " + val + ": " + jpe.toString());
        }
    }

    p.close();
    return t;
}