List of usage examples for org.apache.hadoop.mapreduce.Job.getWorkingDirectory()
public Path getWorkingDirectory() throws IOException
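Before the project examples below, here is a minimal, self-contained sketch of the call. The class name and the side-file naming pattern are illustrative assumptions, not taken from any of the source files listed.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;

public class WorkingDirectoryExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "working-directory-example");

        // getWorkingDirectory() returns the job's current working directory,
        // resolving it against the default file system if none was set.
        Path workingDir = job.getWorkingDirectory();
        System.out.println("Working directory: " + workingDir);

        // The recurring pattern in the examples below: derive a job-scoped
        // side-file path (e.g. a partitions file) under the working directory.
        Path sideFile = new Path(workingDir, "partitions_" + System.currentTimeMillis());
        FileSystem fs = sideFile.getFileSystem(conf);
        System.out.println("Side file would live on: " + fs.getUri());
    }
}

If no working directory has been set explicitly, Hadoop resolves it to the default file system's working directory, which is typically the user's home directory on HDFS.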
From source file:com.ailk.oci.ocnosql.tools.load.single.SingleColumnImportTsv.java
License:Apache License
/**
 * Configure a MapReduce Job to perform an incremental load into the given
 * table. This
 * <ul>
 *   <li>Inspects the table to configure a total order partitioner</li>
 *   <li>Uploads the partitions file to the cluster and adds it to the DistributedCache</li>
 *   <li>Sets the number of reduce tasks to match the current number of regions</li>
 *   <li>Sets the output key/value class to match HFileOutputFormat's requirements</li>
 *   <li>Sets the reducer up to perform the appropriate sorting (either KeyValueSortReducer or
 *       PutSortReducer)</li>
 * </ul>
 * The user should be sure to set the map output value class to either KeyValue or Put before
 * running this function.
 */
public static void configureIncrementalLoad(Job job, HTable table) throws IOException {
    Configuration conf = job.getConfiguration();
    Class<? extends Partitioner> topClass;
    try {
        topClass = getTotalOrderPartitionerClass();
    } catch (ClassNotFoundException e) {
        throw new IOException("Failed getting TotalOrderPartitioner", e);
    }
    // Set the partitioner.
    job.setPartitionerClass(topClass);
    // Set the key class for the job output data.
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    // Set the value class for job outputs.
    job.setOutputValueClass(KeyValue.class);
    // Output format: HFile.
    job.setOutputFormatClass(HFileOutputFormat2.class);

    // Based on the configured map output class, set the correct reducer to properly
    // sort the incoming values.
    // TODO it would be nice to pick one or the other of these formats.
    if (KeyValue.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(KeyValueSortReducer.class);
    } else if (Put.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(SingleColumnReducer.class);
    } else {
        LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
    }

    LOG.info("Looking up current regions for table " + table);
    // Get the start key of each region.
    List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table);
    LOG.info("Configuring " + startKeys.size() + " reduce partitions "
            + "to match current region count");
    // One reduce task per region.
    job.setNumReduceTasks(startKeys.size());

    Path partitionsPath = new Path(job.getWorkingDirectory(), "partitions_" + UUID.randomUUID());
    LOG.info("Writing partition information to " + partitionsPath);
    FileSystem fs = partitionsPath.getFileSystem(conf);
    writePartitions(conf, partitionsPath, startKeys);
    partitionsPath.makeQualified(fs);

    URI cacheUri;
    try {
        // Below we make explicit reference to the bundled TOP. It's cheating.
        // We assume the definition in the hbase-bundled TOP is the same as in
        // hadoop (whether 0.20 or 0.22, etc.)
        /*
        cacheUri = new URI(partitionsPath.toString() + "#"
                + org.apache.hadoop.hbase.mapreduce.hadoopbackport.TotalOrderPartitioner.DEFAULT_PATH);
        */
        cacheUri = new URI(partitionsPath.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    } catch (URISyntaxException e) {
        throw new IOException(e);
    }
    DistributedCache.addCacheFile(cacheUri, conf);
    DistributedCache.createSymlink(conf);

    // Set compression algorithms based on column families.
    configureCompression(table, conf);

    TableMapReduceUtil.addDependencyJars(job);
    LOG.info("Incremental table output configured.");
}
From source file:com.ci.backports.hadoop.hbase.ZHFileOutputFormat.java
License:Apache License
/**
 * Configure a MapReduce Job to perform an incremental load into the given
 * table. This
 * <ul>
 *   <li>Inspects the table to configure a total order partitioner</li>
 *   <li>Uploads the partitions file to the cluster and adds it to the DistributedCache</li>
 *   <li>Sets the number of reduce tasks to match the current number of regions</li>
 *   <li>Sets the output key/value class to match ZHFileOutputFormat's requirements</li>
 *   <li>Sets the reducer up to perform the appropriate sorting (either KeyValueSortReducer or
 *       ZPutSortReducer)</li>
 * </ul>
 * The user should be sure to set the map output value class to either KeyValue or Put before
 * running this function.
 */
public static void configureIncrementalLoad(Job job, HTable table) throws IOException {
    Configuration conf = job.getConfiguration();
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(KeyValue.class);
    job.setOutputFormatClass(ZHFileOutputFormat.class);

    // Based on the configured map output class, set the correct reducer to properly
    // sort the incoming values.
    // TODO it would be nice to pick one or the other of these formats.
    if (KeyValue.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(KeyValueSortReducer.class);
    } else if (Put.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(ZPutSortReducer.class);
    } else {
        LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
    }

    LOG.info("Looking up current regions for table " + table);
    List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table);
    LOG.info("Configuring " + startKeys.size() + " reduce partitions "
            + "to match current region count");
    job.setNumReduceTasks(startKeys.size());

    Path partitionsPath = new Path(job.getWorkingDirectory(), "partitions_" + System.currentTimeMillis());
    LOG.info("Writing partition information to " + partitionsPath);
    FileSystem fs = partitionsPath.getFileSystem(conf);
    writePartitions(conf, partitionsPath, startKeys);
    partitionsPath.makeQualified(fs);

    URI cacheUri;
    try {
        cacheUri = new URI(partitionsPath.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    } catch (URISyntaxException e) {
        throw new IOException(e);
    }
    DistributedCache.addCacheFile(cacheUri, conf);
    DistributedCache.createSymlink(conf);
    LOG.info("Incremental table output configured.");
}
From source file:com.moz.fiji.mapreduce.output.HFileMapReduceJobOutput.java
License:Apache License
/**
 * Configures the partitioner for generating HFiles.
 *
 * <p>Each generated HFile should fit within a region of the target table.
 * Additionally, it's optimal to have only one HFile to load into each region, since a
 * read from that region will require reading from each HFile under management (until
 * compaction happens and merges them all back into one HFile).</p>
 *
 * <p>To achieve this, we configure a TotalOrderPartitioner that will partition the
 * records output from the Mapper based on their rank in a total ordering of the
 * keys. The <code>startKeys</code> argument should contain a list of the first key in
 * each of those partitions.</p>
 *
 * @param job The job to configure.
 * @param startKeys A list of keys that will mark the boundaries between the partitions
 *     for the sorted map output records.
 * @throws IOException If there is an error.
 */
public static void configurePartitioner(Job job, List<HFileKeyValue> startKeys) throws IOException {
    FijiMRPlatformBridge.get().setTotalOrderPartitionerClass(job);

    LOG.info("Configuring " + startKeys.size() + " reduce partitions.");
    job.setNumReduceTasks(startKeys.size());

    // Write the file that the TotalOrderPartitioner reads to determine where to partition records.
    Path partitionFilePath = new Path(job.getWorkingDirectory(), "partitions_" + System.currentTimeMillis());
    LOG.info("Writing partition information to " + partitionFilePath);

    final FileSystem fs = partitionFilePath.getFileSystem(job.getConfiguration());
    partitionFilePath = partitionFilePath.makeQualified(fs);
    writePartitionFile(job.getConfiguration(), partitionFilePath, startKeys);

    // Add it to the distributed cache.
    try {
        final URI cacheUri = new URI(partitionFilePath.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH);
        DistributedCache.addCacheFile(cacheUri, job.getConfiguration());
    } catch (URISyntaxException e) {
        throw new IOException(e);
    }
    DistributedCache.createSymlink(job.getConfiguration());
}
From source file:io.druid.indexer.updater.HadoopConverterJob.java
License:Apache License
public static void cleanup(Job job) throws IOException {
    final Path jobDir = getJobPath(job.getJobID(), job.getWorkingDirectory());
    final FileSystem fs = jobDir.getFileSystem(job.getConfiguration());
    fs.delete(jobDir, true);
    fs.delete(getJobClassPathDir(job.getJobName(), job.getWorkingDirectory()), true);
}
From source file:io.druid.indexer.updater.HadoopConverterJob.java
License:Apache License
public List<DataSegment> run() throws IOException {
    final JobConf jobConf = new JobConf();
    jobConf.setKeepFailedTaskFiles(false);
    for (Map.Entry<String, String> entry : converterConfig.getHadoopProperties().entrySet()) {
        jobConf.set(entry.getKey(), entry.getValue(), "converterConfig.getHadoopProperties()");
    }
    final List<DataSegment> segments = converterConfig.getSegments();
    if (segments.isEmpty()) {
        throw new IAE("No segments found for datasource [%s]", converterConfig.getDataSource());
    }
    converterConfigIntoConfiguration(converterConfig, segments, jobConf);

    jobConf.setNumReduceTasks(0); // Map only. Number of map tasks determined by input format.
    jobConf.setWorkingDirectory(new Path(converterConfig.getDistributedSuccessCache()));

    setJobName(jobConf, segments);

    if (converterConfig.getJobPriority() != null) {
        jobConf.setJobPriority(JobPriority.valueOf(converterConfig.getJobPriority()));
    }

    final Job job = Job.getInstance(jobConf);

    job.setInputFormatClass(ConfigInputFormat.class);
    job.setMapperClass(ConvertingMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setMapSpeculativeExecution(false);
    job.setOutputFormatClass(ConvertingOutputFormat.class);

    JobHelper.setupClasspath(JobHelper.distributedClassPath(jobConf.getWorkingDirectory()),
            JobHelper.distributedClassPath(getJobClassPathDir(job.getJobName(), jobConf.getWorkingDirectory())),
            job);

    Throwable throwable = null;
    try {
        job.submit();
        log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());
        final boolean success = job.waitForCompletion(true);
        if (!success) {
            final TaskReport[] reports = job.getTaskReports(TaskType.MAP);
            if (reports != null) {
                for (final TaskReport report : reports) {
                    log.error("Error in task [%s] : %s", report.getTaskId(),
                            Arrays.toString(report.getDiagnostics()));
                }
            }
            return null;
        }
        try {
            loadedBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_LOADED).getValue();
            writtenBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_WRITTEN).getValue();
        } catch (IOException ex) {
            log.error(ex, "Could not fetch counters");
        }
        final JobID jobID = job.getJobID();

        final Path jobDir = getJobPath(jobID, job.getWorkingDirectory());
        final FileSystem fs = jobDir.getFileSystem(job.getConfiguration());
        final RemoteIterator<LocatedFileStatus> it = fs.listFiles(jobDir, true);
        final List<Path> goodPaths = new ArrayList<>();
        while (it.hasNext()) {
            final LocatedFileStatus locatedFileStatus = it.next();
            if (locatedFileStatus.isFile()) {
                final Path myPath = locatedFileStatus.getPath();
                if (ConvertingOutputFormat.DATA_SUCCESS_KEY.equals(myPath.getName())) {
                    goodPaths.add(new Path(myPath.getParent(), ConvertingOutputFormat.DATA_FILE_KEY));
                }
            }
        }
        if (goodPaths.isEmpty()) {
            log.warn("No good data found at [%s]", jobDir);
            return null;
        }
        final List<DataSegment> returnList = ImmutableList
                .copyOf(Lists.transform(goodPaths, new Function<Path, DataSegment>() {
                    @Nullable
                    @Override
                    public DataSegment apply(final Path input) {
                        try {
                            if (!fs.exists(input)) {
                                throw new ISE("Somehow [%s] was found but [%s] is missing at [%s]",
                                        ConvertingOutputFormat.DATA_SUCCESS_KEY,
                                        ConvertingOutputFormat.DATA_FILE_KEY, jobDir);
                            }
                        } catch (final IOException e) {
                            throw Throwables.propagate(e);
                        }
                        try (final InputStream stream = fs.open(input)) {
                            return HadoopDruidConverterConfig.jsonMapper.readValue(stream, DataSegment.class);
                        } catch (final IOException e) {
                            throw Throwables.propagate(e);
                        }
                    }
                }));
        if (returnList.size() == segments.size()) {
            return returnList;
        } else {
            throw new ISE(
                    "Tasks reported success but result length did not match! Expected %d found %d at path [%s]",
                    segments.size(), returnList.size(), jobDir);
        }
    } catch (InterruptedException | ClassNotFoundException e) {
        RuntimeException exception = Throwables.propagate(e);
        throwable = exception;
        throw exception;
    } catch (Throwable t) {
        throwable = t;
        throw t;
    } finally {
        try {
            cleanup(job);
        } catch (IOException e) {
            if (throwable != null) {
                throwable.addSuppressed(e);
            } else {
                log.error(e, "Could not clean up job [%s]", job.getJobID());
            }
        }
    }
}
From source file:org.apache.druid.indexer.updater.HadoopConverterJob.java
License:Apache License
public static void cleanup(Job job) throws IOException {
    final Path jobDir = getJobPath(job.getJobID(), job.getWorkingDirectory());
    final FileSystem fs = jobDir.getFileSystem(job.getConfiguration());
    RuntimeException e = null;
    try {
        JobHelper.deleteWithRetry(fs, jobDir, true);
    } catch (RuntimeException ex) {
        e = ex;
    }
    try {
        JobHelper.deleteWithRetry(fs, getJobClassPathDir(job.getJobName(), job.getWorkingDirectory()), true);
    } catch (RuntimeException ex) {
        if (e == null) {
            e = ex;
        } else {
            e.addSuppressed(ex);
        }
    }
    if (e != null) {
        throw e;
    }
}
From source file:org.apache.druid.indexer.updater.HadoopConverterJob.java
License:Apache License
public List<DataSegment> run() throws IOException {
    final JobConf jobConf = new JobConf();
    jobConf.setKeepFailedTaskFiles(false);
    for (Map.Entry<String, String> entry : converterConfig.getHadoopProperties().entrySet()) {
        jobConf.set(entry.getKey(), entry.getValue(), "converterConfig.getHadoopProperties()");
    }
    final List<DataSegment> segments = converterConfig.getSegments();
    if (segments.isEmpty()) {
        throw new IAE("No segments found for datasource [%s]", converterConfig.getDataSource());
    }
    converterConfigIntoConfiguration(converterConfig, segments, jobConf);

    jobConf.setNumReduceTasks(0); // Map only. Number of map tasks determined by input format.
    jobConf.setWorkingDirectory(new Path(converterConfig.getDistributedSuccessCache()));

    setJobName(jobConf, segments);

    if (converterConfig.getJobPriority() != null) {
        jobConf.setJobPriority(JobPriority.valueOf(converterConfig.getJobPriority()));
    }

    final Job job = Job.getInstance(jobConf);

    job.setInputFormatClass(ConfigInputFormat.class);
    job.setMapperClass(ConvertingMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setMapSpeculativeExecution(false);
    job.setOutputFormatClass(ConvertingOutputFormat.class);

    JobHelper.setupClasspath(JobHelper.distributedClassPath(jobConf.getWorkingDirectory()),
            JobHelper.distributedClassPath(getJobClassPathDir(job.getJobName(), jobConf.getWorkingDirectory())),
            job);

    Throwable throwable = null;
    try {
        job.submit();
        log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());
        final boolean success = job.waitForCompletion(true);
        if (!success) {
            final TaskReport[] reports = job.getTaskReports(TaskType.MAP);
            if (reports != null) {
                for (final TaskReport report : reports) {
                    log.error("Error in task [%s] : %s", report.getTaskId(),
                            Arrays.toString(report.getDiagnostics()));
                }
            }
            return null;
        }
        try {
            loadedBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_LOADED).getValue();
            writtenBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_WRITTEN).getValue();
        } catch (IOException ex) {
            log.error(ex, "Could not fetch counters");
        }
        final JobID jobID = job.getJobID();

        final Path jobDir = getJobPath(jobID, job.getWorkingDirectory());
        final FileSystem fs = jobDir.getFileSystem(job.getConfiguration());
        final RemoteIterator<LocatedFileStatus> it = fs.listFiles(jobDir, true);
        final List<Path> goodPaths = new ArrayList<>();
        while (it.hasNext()) {
            final LocatedFileStatus locatedFileStatus = it.next();
            if (locatedFileStatus.isFile()) {
                final Path myPath = locatedFileStatus.getPath();
                if (ConvertingOutputFormat.DATA_SUCCESS_KEY.equals(myPath.getName())) {
                    goodPaths.add(new Path(myPath.getParent(), ConvertingOutputFormat.DATA_FILE_KEY));
                }
            }
        }
        if (goodPaths.isEmpty()) {
            log.warn("No good data found at [%s]", jobDir);
            return null;
        }
        final List<DataSegment> returnList = ImmutableList
                .copyOf(Lists.transform(goodPaths, new Function<Path, DataSegment>() {
                    @Nullable
                    @Override
                    public DataSegment apply(final Path input) {
                        try {
                            if (!fs.exists(input)) {
                                throw new ISE("Somehow [%s] was found but [%s] is missing at [%s]",
                                        ConvertingOutputFormat.DATA_SUCCESS_KEY,
                                        ConvertingOutputFormat.DATA_FILE_KEY, jobDir);
                            }
                        } catch (final IOException e) {
                            throw Throwables.propagate(e);
                        }
                        try (final InputStream stream = fs.open(input)) {
                            return HadoopDruidConverterConfig.jsonMapper.readValue(stream, DataSegment.class);
                        } catch (final IOException e) {
                            throw Throwables.propagate(e);
                        }
                    }
                }));
        if (returnList.size() == segments.size()) {
            return returnList;
        } else {
            throw new ISE(
                    "Tasks reported success but result length did not match! Expected %d found %d at path [%s]",
                    segments.size(), returnList.size(), jobDir);
        }
    } catch (InterruptedException | ClassNotFoundException e) {
        RuntimeException exception = Throwables.propagate(e);
        throwable = exception;
        throw exception;
    } catch (Throwable t) {
        throwable = t;
        throw t;
    } finally {
        try {
            cleanup(job);
        } catch (IOException e) {
            if (throwable != null) {
                throwable.addSuppressed(e);
            } else {
                log.error(e, "Could not clean up job [%s]", job.getJobID());
            }
        }
    }
}
From source file:org.kiji.mapreduce.output.HFileMapReduceJobOutput.java
License:Apache License
/**
 * Configures the partitioner for generating HFiles.
 *
 * <p>Each generated HFile should fit within a region of the target table.
 * Additionally, it's optimal to have only one HFile to load into each region, since a
 * read from that region will require reading from each HFile under management (until
 * compaction happens and merges them all back into one HFile).</p>
 *
 * <p>To achieve this, we configure a TotalOrderPartitioner that will partition the
 * records output from the Mapper based on their rank in a total ordering of the
 * keys. The <code>startKeys</code> argument should contain a list of the first key in
 * each of those partitions.</p>
 *
 * @param job The job to configure.
 * @param startKeys A list of keys that will mark the boundaries between the partitions
 *     for the sorted map output records.
 * @throws IOException If there is an error.
 */
private static void configurePartitioner(Job job, List<HFileKeyValue> startKeys) throws IOException {
    job.setPartitionerClass(TotalOrderPartitioner.class);

    LOG.info("Configuring " + startKeys.size() + " reduce partitions.");
    job.setNumReduceTasks(startKeys.size());

    // Write the file that the TotalOrderPartitioner reads to determine where to partition records.
    Path partitionFilePath = new Path(job.getWorkingDirectory(), "partitions_" + System.currentTimeMillis());
    LOG.info("Writing partition information to " + partitionFilePath);

    final FileSystem fs = partitionFilePath.getFileSystem(job.getConfiguration());
    partitionFilePath = partitionFilePath.makeQualified(fs);
    writePartitionFile(job.getConfiguration(), partitionFilePath, startKeys);

    // Add it to the distributed cache.
    try {
        final URI cacheUri = new URI(partitionFilePath.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH);
        DistributedCache.addCacheFile(cacheUri, job.getConfiguration());
    } catch (URISyntaxException e) {
        throw new IOException(e);
    }
    DistributedCache.createSymlink(job.getConfiguration());
}
From source file:smile.wide.AttributeValueHistogram.java
License:Apache License
@Override
public int run(String[] arg) throws Exception {
    if (arg.length < 2) {
        s_logger.fatal("Usage: AttributeValueHistogram <infile> <outfile>");
        // TODO: return an error code?
    }
    s_logger.debug("Got " + arg.length + " arguments");

    inPath_ = arg[0];
    s_logger.info("Input path is " + inPath_);

    // parse the key-value arguments passed - by now these are the arguments
    // specific to AttributeValueHistogram
    for (int i = 1; i < arg.length; ++i) {
        String[] tokens = arg[i].split("=");
        if (tokens.length != 2) {
            s_logger.fatal("Can't parse argument " + arg[i]);
        }
        if (tokens[0].equals("xdata.bayesnets.datasetreader.class")) {
            readerClass_ = tokens[1].trim();
            s_logger.debug("Set reader class to " + readerClass_);
        } else if (tokens[0].equals("xdata.bayesnets.datasetreader.filter")) {
            readerFilter_ = tokens[1].trim();
            s_logger.debug("Set reader filter to " + readerFilter_);
        } else if (tokens[0].equals("xdata.bayesnets.datasetreader.instid")) {
            readerInstID_ = tokens[1].trim();
            s_logger.debug("Set reader's instance ID column to " + readerInstID_);
        } else if (tokens[0].equals("xdata.bayesnets.datasetreader.variablenames")) {
            variableNames_ = tokens[1].trim();
            s_logger.debug("Set reader's variable names to " + variableNames_);
        } else {
            s_logger.warn("Unknown argument " + arg[i]);
        }
    }

    conf_ = getConf();

    // pass the reader class to the mapper, in jobconf
    // TODO: use setClass here - fails early if wrong, not in the mapper
    conf_.set("xdata.bayesnets.datasetreader.class", readerClass_);
    conf_.set("xdata.bayesnets.datasetreader.filter", readerFilter_);
    // conf_.set("xdata.bayesnets.datasetreader.instid", readerInstID_); // not used
    conf_.set("xdata.bayesnets.datasetreader.variablenames", variableNames_);

    conf_.setBoolean("mapred.compress.map.output", true); // compress intermediate data
    conf_.set("mapred.output.compression.type", CompressionType.BLOCK.toString()); // by block, to keep splittable
    conf_.setClass("mapred.map.output.compression.codec", GzipCodec.class, CompressionCodec.class);

    // for debugging
    conf_.set("keep.failed.task.files", "true");
    conf_.set("keep.failed.task.pattern", "*");

    Job job = new Job(conf_);
    job.setJarByClass(AttributeValueHistogram.class); // use this jar
    job.setJobName("Collect value histograms by attribute");

    FileInputFormat.addInputPath(job, new Path(inPath_));

    int rnd = (new Random()).nextInt();
    lastWorkingDir_ = job.getWorkingDirectory().toUri();
    s_logger.info("Job working directory is " + lastWorkingDir_);
    String tempDirName = job.getWorkingDirectory() + "/tmp/attvalhist" + rnd + ".tmp";
    s_logger.info("Temp files in directory " + tempDirName);
    FileOutputFormat.setOutputPath(job, new Path(tempDirName));

    job.setMapperClass(AttributeValueHistogramMapper.class);
    job.setCombinerClass(AttributeValueHistogramReducer.class);
    job.setReducerClass(AttributeValueHistogramReducer.class);

    // set both the map and reduce in/out classes
    job.setOutputKeyClass(Text.class); // the name of the attribute
    job.setOutputValueClass(MapWritable.class); // Value -> count map
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    // run 'em
    int result = job.waitForCompletion(true) ? 0 : 16;

    // retain the temp file, collect the output
    attributeValues_ = new TreeMap<String, Map<String, Integer>>();

    FileSystem fs = FileSystem.get(conf_);
    SequenceFile.Reader reader = null;

    Path resPath = new Path(tempDirName);
    FileStatus[] stats = fs.listStatus(resPath);

    // read all output files
    for (FileStatus stat : stats) {
        if (stat.getPath().toUri().toString().contains("part-r-"))
            try {
                s_logger.info("Reading results from " + stat.getPath());
                reader = new SequenceFile.Reader(fs, stat.getPath(), conf_);
                // Text key = (Text) ReflectionUtils.newInstance(reader.getKeyClass(), conf_);
                // MapWritable value = (MapWritable) ReflectionUtils.newInstance(reader.getValueClass(), conf_);
                Text key = new Text();
                MapWritable value = new MapWritable();
                while (reader.next(key, value)) {
                    TreeMap<String, Integer> valueCounts = new TreeMap<String, Integer>();
                    for (Writable attValue : value.keySet()) {
                        valueCounts.put(((Text) attValue).toString(),
                                ((IntWritable) (value.get(attValue))).get());
                    }
                    attributeValues_.put(key.toString(), valueCounts);
                }
            } finally {
                IOUtils.closeStream(reader);
            }
    }
    fs.deleteOnExit(resPath);

    return result;
}
From source file:smile.wide.facebook.ExperimentDriver.java
License:Apache License
private int inference() {
    try {
        DistributedCache.createSymlink(conf_);
        try {
            DistributedCache.addCacheFile(new URI(libHDFSPath_ + "/smile.jar#smile.jar"), conf_);
            DistributedCache.addCacheFile(new URI(libHDFSPath_ + "/libjsmile.so#libjsmile.so"), conf_);
            DistributedCache.addCacheFile(
                    new URI(jobHDFSPath_ + "/tmp/" + modifiedNetwork_ + "#" + basename(modifiedNetwork_)),
                    conf_);
        } catch (URISyntaxException e) {
            s_logger.fatal("Bad URL for modified network file.");
            return -12;
        }

        // the principle for whether to use string column names or integer column indexes:
        // - when talking about variables in the BN, use string names
        // - when talking about data munging, use column indexes

        // configure the inference task
        conf_.set("xdata.bayesnets.networkfile", basename(modifiedNetwork_));
        conf_.set("xdata.bayesnets.datasetreader.class", FacebookCSVReader.class.getName());
        conf_.set("xdata.bayesnets.datasetreader.filter", "3,5,7,10,11,12");
        conf_.set("xdata.bayesnets.datasetreader.variablenames",
                "FirstName,MiddleName,Sex,IsAppUser,LikesCount,FriendCount");
        conf_.set("xdata.bayesnets.datasetreader.instid", "1");
        conf_.set("xdata.bayesnets.queryvariable", "Age");

        Job job = new Job(conf_);
        job.setJarByClass(ExperimentDriver.class); // use this jar
        job.setJobName("Facebook Inference Performance Test");

        FileInputFormat.addInputPath(job, new Path(inPath_));
        FileOutputFormat.setOutputPath(job, new Path(outPath_));

        job.setMapperClass(PerInstanceInferenceMapper.class);
        // there need not be any reducer
        // job.setReducerClass(PerInstanceInferenceReducer.class);

        // set both the map and reduce in/out classes
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        // but redefine them for the mapper
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(DoubleArrayWritable.class);

        s_logger.info("Job working directory is " + job.getWorkingDirectory());

        return job.waitForCompletion(true) ? 0 : 1;
    } catch (IOException e) {
        System.err.println("Something went badly wrong in IO.");
        System.exit(2);
    } catch (InterruptedException e) {
        System.err.println("Job interrupted.");
        e.printStackTrace();
    } catch (ClassNotFoundException e) {
        System.err.println("ClassNotFound exception.");
        e.printStackTrace();
    }
    return 2;
}