Example usage for org.apache.hadoop.io MapWritable keySet

List of usage examples for org.apache.hadoop.io MapWritable keySet

Introduction

On this page you can find example usages of org.apache.hadoop.io MapWritable keySet.

Prototype

@Override
public Set<Writable> keySet()
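
MapWritable implements java.util.Map<Writable, Writable>, so keySet() simply exposes the keys currently stored in the map. A minimal standalone sketch of calling it (the Text/IntWritable contents below are illustrative only, not taken from any of the examples on this page):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;

public class KeySetSketch {
    public static void main(String[] args) {
        MapWritable counts = new MapWritable();
        counts.put(new Text("red"), new IntWritable(3));
        counts.put(new Text("blue"), new IntWritable(5));

        // keySet() returns the stored keys; each value is fetched with get()
        for (Writable key : counts.keySet()) {
            IntWritable count = (IntWritable) counts.get(key);
            System.out.println(key + " -> " + count.get());
        }
    }
}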


Usage

From source file: org.freeeed.mr.FreeEedReducer.java

License: Apache License

private DocumentMetadata getAllMetadata(MapWritable map) {
    DocumentMetadata metadata = new DocumentMetadata();
    Set<Writable> set = map.keySet();
    Iterator<Writable> iter = set.iterator();
    while (iter.hasNext()) {
        String name = iter.next().toString();
        if (!ParameterProcessing.NATIVE.equals(name) && !ParameterProcessing.NATIVE_AS_PDF.equals(name)
                && !name.startsWith(ParameterProcessing.NATIVE_AS_HTML)) { // all metadata but native - which is bytes!
            Text value = (Text) map.get(new Text(name));
            metadata.set(name, value.toString());
        }
    }
    return metadata;
}
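
Since MapWritable implements java.util.Map<Writable, Writable>, the same filtering could also be written over map.entrySet(), which avoids the extra map.get(new Text(name)) lookup for every key. A sketch of that alternative (the method name getAllMetadataViaEntries is made up here; the rest mirrors the example above):

private DocumentMetadata getAllMetadataViaEntries(MapWritable map) {
    DocumentMetadata metadata = new DocumentMetadata();
    // Iterate entries instead of keys, so each value is read without a second lookup.
    for (java.util.Map.Entry<Writable, Writable> entry : map.entrySet()) {
        String name = entry.getKey().toString();
        if (!ParameterProcessing.NATIVE.equals(name) && !ParameterProcessing.NATIVE_AS_PDF.equals(name)
                && !name.startsWith(ParameterProcessing.NATIVE_AS_HTML)) {
            metadata.set(name, entry.getValue().toString());
        }
    }
    return metadata;
}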

From source file: smile.wide.AttributeValueHistogram.java

License: Apache License

@Override
public int run(String[] arg) throws Exception {

    if (arg.length < 2) {
        s_logger.fatal("Usage: AttributeValueHistogram <infile> <outfile>");
        return 1; // abort early rather than reading missing arguments below
    }

    s_logger.debug("Got " + arg.length + " arguments");

    inPath_ = arg[0];
    s_logger.info("Input path is " + inPath_);

    // parse the key-value arguments passed - by now these are the arguments
    // specific to AttributeValueHistogram
    for (int i = 1; i < arg.length; ++i) {
        String[] tokens = arg[i].split("=");
        if (tokens.length != 2) {
            s_logger.fatal("Can't parse argument " + arg[i]);
            continue; // skip malformed key=value arguments
        }

        if (tokens[0].equals("xdata.bayesnets.datasetreader.class")) {
            readerClass_ = tokens[1].trim();
            s_logger.debug("Set reader class to " + readerClass_);
        } else if (tokens[0].equals("xdata.bayesnets.datasetreader.filter")) {
            readerFilter_ = tokens[1].trim();
            s_logger.debug("Set reader filter to " + readerFilter_);
        } else if (tokens[0].equals("xdata.bayesnets.datasetreader.instid")) {
            readerInstID_ = tokens[1].trim();
            s_logger.debug("Set reader's instance ID column to " + readerInstID_);
        } else if (tokens[0].equals("xdata.bayesnets.datasetreader.variablenames")) {
            variableNames_ = tokens[1].trim();
            s_logger.debug("Set reader's variable names to " + variableNames_);
        } else {
            s_logger.warn("Unknown argument " + arg[i]);
        }
    }

    conf_ = getConf();

    // pass the reader class to the mapper, in jobconf      
    // TODO: use setClass here - fails early if wrong, not in the mapper
    conf_.set("xdata.bayesnets.datasetreader.class", readerClass_);
    conf_.set("xdata.bayesnets.datasetreader.filter", readerFilter_);
    // conf_.set("xdata.bayesnets.datasetreader.instid", readerInstID_); // not used
    conf_.set("xdata.bayesnets.datasetreader.variablenames", variableNames_);

    conf_.setBoolean("mapred.compress.map.output", true); // compress intermediate data
    conf_.set("mapred.output.compression.type", CompressionType.BLOCK.toString()); // by block, to keep splittable
    conf_.setClass("mapred.map.output.compression.codec", GzipCodec.class, CompressionCodec.class);

    // for debugging               
    conf_.set("keep.failed.task.files", "true");
    conf_.set("keep.failed.task.pattern", "*");

    Job job = new Job(conf_);

    job.setJarByClass(AttributeValueHistogram.class); // use this jar
    job.setJobName("Collect value histograms by attribute");

    FileInputFormat.addInputPath(job, new Path(inPath_));

    int rnd = (new Random()).nextInt();
    lastWorkingDir_ = job.getWorkingDirectory().toUri();
    s_logger.info("Job working directory is " + lastWorkingDir_);
    String tempDirName = job.getWorkingDirectory() + "/tmp/attvalhist" + rnd + ".tmp";
    s_logger.info("Temp files in directory " + tempDirName);
    FileOutputFormat.setOutputPath(job, new Path(tempDirName));

    job.setMapperClass(AttributeValueHistogramMapper.class);
    job.setCombinerClass(AttributeValueHistogramReducer.class);
    job.setReducerClass(AttributeValueHistogramReducer.class);

    // set both the map and reduce in/out classes
    job.setOutputKeyClass(Text.class); // the name of the attribute
    job.setOutputValueClass(MapWritable.class); // Value -> count map
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    // run'em
    int result = job.waitForCompletion(true) ? 0 : 16;

    // retain the temp file, collect the output      
    attributeValues_ = new TreeMap<String, Map<String, Integer>>();

    FileSystem fs = FileSystem.get(conf_);
    SequenceFile.Reader reader = null;

    Path resPath = new Path(tempDirName);
    FileStatus[] stats = fs.listStatus(resPath);

    // read all output files
    for (FileStatus stat : stats) {
        if (stat.getPath().toUri().toString().contains("part-r-"))
            try {
                s_logger.info("Reading results from " + stat.getPath());
                reader = new SequenceFile.Reader(fs, stat.getPath(), conf_);
                // Text key = (Text) ReflectionUtils.newInstance(reader.getKeyClass(), conf_);
                // MapWritable value = (MapWritable) ReflectionUtils.newInstance(reader.getValueClass(), conf_);
                Text key = new Text();
                MapWritable value = new MapWritable();

                while (reader.next(key, value)) {
                    TreeMap<String, Integer> valueCounts = new TreeMap<String, Integer>();
                    for (Writable attValue : value.keySet()) {
                        valueCounts.put(((Text) attValue).toString(),
                                ((IntWritable) (value.get(attValue))).get());
                    }
                    attributeValues_.put(key.toString(), valueCounts);
                }
            } finally {
                IOUtils.closeStream(reader);
            }
    }

    fs.deleteOnExit(resPath);

    return result;
}
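
The AttributeValueHistogramMapper registered in this driver is not shown on this page. As a rough illustration only, a mapper compatible with the Text key / MapWritable value classes configured above, and with the reducer below, might emit one {value -> 1} map per attribute value it sees, along these lines (the class name, CSV parsing, and column naming are all assumptions, not the actual smile.wide implementation):

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class ValueCountMapperSketch extends Mapper<LongWritable, Text, Text, MapWritable> {
    @Override
    protected void map(LongWritable offset, Text line, Context context)
            throws IOException, InterruptedException {
        String[] fields = line.toString().split(","); // assumes a plain CSV record
        for (int col = 0; col < fields.length; ++col) {
            // one singleton value -> count map per attribute value seen
            MapWritable singleCount = new MapWritable();
            singleCount.put(new Text(fields[col].trim()), new IntWritable(1));
            context.write(new Text("column" + col), singleCount);
        }
    }
}

The combiner/reducer below would then merge these singleton maps into one value histogram per attribute.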

From source file: smile.wide.AttributeValueHistogramReducer.java

License: Apache License

@Override
public void reduce(Text key, Iterable<MapWritable> values, Context context)
        throws IOException, InterruptedException {
    // Let's have a map and internally collect them

    int maps = 0;
    int vals = 0;

    HashMap<Text, Integer> myMap = new HashMap<Text, Integer>();

    for (MapWritable m : values) {
        maps++;
        for (Writable valName : m.keySet()) {

            Text val = (Text) valName;
            Integer count = ((IntWritable) (m.get(valName))).get();
            if (myMap.containsKey(val)) {
                myMap.put(val, myMap.get(val) + count);
            } else {
                myMap.put(val, count);
                vals++;
            }
        }
    }

    s_logger.debug("Reducer/combiner got " + maps + " maps, with a total of " + vals
            + " distinct values for attribute `" + key + "`");

    // now output
    // key is key 
    // value is myMap as MapWritable<Text, IntWritable>

    MapWritable output = new MapWritable();
    for (Text t : myMap.keySet()) {
        s_logger.debug("Outputting count " + myMap.get(t) + " for attribute " + t);
        output.put(t, new IntWritable(myMap.get(t)));
    }

    context.write(key, output);

}
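
Because this reduce step only sums per-value counts, the driver above can register the same class as both the combiner and the reducer: merging partial {value -> count} maps is associative and commutative, so combining map-side output does not change the final histogram.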