List of usage examples for org.apache.hadoop.mapreduce.Job.getWorkingDirectory()
public Path getWorkingDirectory() throws IOException
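Before the project examples below, here is a minimal, self-contained sketch of the call. The class name and the side-file naming pattern are illustrative assumptions, not taken from any of the source files listed.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;

public class WorkingDirectoryExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "working-directory-example");

        // getWorkingDirectory() returns the job's current working directory,
        // resolving it against the default file system if none was set.
        Path workingDir = job.getWorkingDirectory();
        System.out.println("Working directory: " + workingDir);

        // The recurring pattern in the examples below: derive a job-scoped
        // side-file path (e.g. a partitions file) under the working directory.
        Path sideFile = new Path(workingDir, "partitions_" + System.currentTimeMillis());
        FileSystem fs = sideFile.getFileSystem(conf);
        System.out.println("Side file would live on: " + fs.getUri());
    }
}

If no working directory has been set explicitly, Hadoop resolves it to the default file system's working directory, which is typically the user's home directory on HDFS.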
From source file:com.ailk.oci.ocnosql.tools.load.single.SingleColumnImportTsv.java
License:Apache License
/**
 * Configure a MapReduce Job to perform an incremental load into the given
 * table. This
 * <ul>
 *   <li>Inspects the table to configure a total order partitioner</li>
 *   <li>Uploads the partitions file to the cluster and adds it to the DistributedCache</li>
 *   <li>Sets the number of reduce tasks to match the current number of regions</li>
 *   <li>Sets the output key/value class to match HFileOutputFormat's requirements</li>
 *   <li>Sets the reducer up to perform the appropriate sorting (either KeyValueSortReducer or
 *       PutSortReducer)</li>
 * </ul>
 * The user should be sure to set the map output value class to either KeyValue or Put before
 * running this function.
 */
public static void configureIncrementalLoad(Job job, HTable table) throws IOException {
    Configuration conf = job.getConfiguration();
    Class<? extends Partitioner> topClass;
    try {
        topClass = getTotalOrderPartitionerClass();
    } catch (ClassNotFoundException e) {
        throw new IOException("Failed getting TotalOrderPartitioner", e);
    }
    // Set the partitioner.
    job.setPartitionerClass(topClass);
    // Set the key class for the job output data.
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    // Set the value class for job outputs.
    job.setOutputValueClass(KeyValue.class);
    // Output format: HFile.
    job.setOutputFormatClass(HFileOutputFormat2.class);

    // Based on the configured map output class, set the correct reducer to properly
    // sort the incoming values.
    // TODO it would be nice to pick one or the other of these formats.
    if (KeyValue.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(KeyValueSortReducer.class);
    } else if (Put.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(SingleColumnReducer.class);
    } else {
        LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
    }

    LOG.info("Looking up current regions for table " + table);
    // Get the start key of each region.
    List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table);
    LOG.info("Configuring " + startKeys.size() + " reduce partitions "
            + "to match current region count");
    // One reduce task per region.
    job.setNumReduceTasks(startKeys.size());

    Path partitionsPath = new Path(job.getWorkingDirectory(), "partitions_" + UUID.randomUUID());
    LOG.info("Writing partition information to " + partitionsPath);
    FileSystem fs = partitionsPath.getFileSystem(conf);
    writePartitions(conf, partitionsPath, startKeys);
    partitionsPath.makeQualified(fs);

    URI cacheUri;
    try {
        // Below we make explicit reference to the bundled TOP. It's cheating.
        // We assume the definition in the hbase-bundled TOP is the same as in
        // hadoop (whether 0.20 or 0.22, etc.)
        /*
        cacheUri = new URI(partitionsPath.toString() + "#"
                + org.apache.hadoop.hbase.mapreduce.hadoopbackport.TotalOrderPartitioner.DEFAULT_PATH);
        */
        cacheUri = new URI(partitionsPath.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    } catch (URISyntaxException e) {
        throw new IOException(e);
    }
    DistributedCache.addCacheFile(cacheUri, conf);
    DistributedCache.createSymlink(conf);

    // Set compression algorithms based on column families.
    configureCompression(table, conf);

    TableMapReduceUtil.addDependencyJars(job);
    LOG.info("Incremental table output configured.");
}
From source file:com.ci.backports.hadoop.hbase.ZHFileOutputFormat.java
License:Apache License
/**
 * Configure a MapReduce Job to perform an incremental load into the given
 * table. This
 * <ul>
 *   <li>Inspects the table to configure a total order partitioner</li>
 *   <li>Uploads the partitions file to the cluster and adds it to the DistributedCache</li>
 *   <li>Sets the number of reduce tasks to match the current number of regions</li>
 *   <li>Sets the output key/value class to match ZHFileOutputFormat's requirements</li>
 *   <li>Sets the reducer up to perform the appropriate sorting (either KeyValueSortReducer or
 *       ZPutSortReducer)</li>
 * </ul>
 * The user should be sure to set the map output value class to either KeyValue or Put before
 * running this function.
 */
public static void configureIncrementalLoad(Job job, HTable table) throws IOException {
    Configuration conf = job.getConfiguration();
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setOutputKeyClass(ImmutableBytesWritable.class);
    job.setOutputValueClass(KeyValue.class);
    job.setOutputFormatClass(ZHFileOutputFormat.class);

    // Based on the configured map output class, set the correct reducer to properly
    // sort the incoming values.
    // TODO it would be nice to pick one or the other of these formats.
    if (KeyValue.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(KeyValueSortReducer.class);
    } else if (Put.class.equals(job.getMapOutputValueClass())) {
        job.setReducerClass(ZPutSortReducer.class);
    } else {
        LOG.warn("Unknown map output value type:" + job.getMapOutputValueClass());
    }

    LOG.info("Looking up current regions for table " + table);
    List<ImmutableBytesWritable> startKeys = getRegionStartKeys(table);
    LOG.info("Configuring " + startKeys.size() + " reduce partitions "
            + "to match current region count");
    job.setNumReduceTasks(startKeys.size());

    Path partitionsPath = new Path(job.getWorkingDirectory(), "partitions_" + System.currentTimeMillis());
    LOG.info("Writing partition information to " + partitionsPath);
    FileSystem fs = partitionsPath.getFileSystem(conf);
    writePartitions(conf, partitionsPath, startKeys);
    partitionsPath.makeQualified(fs);

    URI cacheUri;
    try {
        cacheUri = new URI(partitionsPath.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    } catch (URISyntaxException e) {
        throw new IOException(e);
    }
    DistributedCache.addCacheFile(cacheUri, conf);
    DistributedCache.createSymlink(conf);
    LOG.info("Incremental table output configured.");
}
From source file:com.moz.fiji.mapreduce.output.HFileMapReduceJobOutput.java
License:Apache License
/**
 * Configures the partitioner for generating HFiles.
 *
 * <p>Each generated HFile should fit within a region of the target table.
 * Additionally, it's optimal to have only one HFile to load into each region, since a
 * read from that region will require reading from each HFile under management (until
 * compaction happens and merges them all back into one HFile).</p>
 *
 * <p>To achieve this, we configure a TotalOrderPartitioner that will partition the
 * records output from the Mapper based on their rank in a total ordering of the
 * keys. The <code>startKeys</code> argument should contain a list of the first key in
 * each of those partitions.</p>
 *
 * @param job The job to configure.
 * @param startKeys A list of keys that will mark the boundaries between the partitions
 *     for the sorted map output records.
 * @throws IOException If there is an error.
 */
public static void configurePartitioner(Job job, List<HFileKeyValue> startKeys) throws IOException {
    FijiMRPlatformBridge.get().setTotalOrderPartitionerClass(job);

    LOG.info("Configuring " + startKeys.size() + " reduce partitions.");
    job.setNumReduceTasks(startKeys.size());

    // Write the file that the TotalOrderPartitioner reads to determine where to partition records.
    Path partitionFilePath = new Path(job.getWorkingDirectory(), "partitions_" + System.currentTimeMillis());
    LOG.info("Writing partition information to " + partitionFilePath);

    final FileSystem fs = partitionFilePath.getFileSystem(job.getConfiguration());
    partitionFilePath = partitionFilePath.makeQualified(fs);
    writePartitionFile(job.getConfiguration(), partitionFilePath, startKeys);

    // Add it to the distributed cache.
    try {
        final URI cacheUri = new URI(partitionFilePath.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH);
        DistributedCache.addCacheFile(cacheUri, job.getConfiguration());
    } catch (URISyntaxException e) {
        throw new IOException(e);
    }
    DistributedCache.createSymlink(job.getConfiguration());
}
From source file:io.druid.indexer.updater.HadoopConverterJob.java
License:Apache License
public static void cleanup(Job job) throws IOException {
    final Path jobDir = getJobPath(job.getJobID(), job.getWorkingDirectory());
    final FileSystem fs = jobDir.getFileSystem(job.getConfiguration());
    fs.delete(jobDir, true);
    fs.delete(getJobClassPathDir(job.getJobName(), job.getWorkingDirectory()), true);
}
From source file:io.druid.indexer.updater.HadoopConverterJob.java
License:Apache License
public List<DataSegment> run() throws IOException {
    final JobConf jobConf = new JobConf();
    jobConf.setKeepFailedTaskFiles(false);
    for (Map.Entry<String, String> entry : converterConfig.getHadoopProperties().entrySet()) {
        jobConf.set(entry.getKey(), entry.getValue(), "converterConfig.getHadoopProperties()");
    }
    final List<DataSegment> segments = converterConfig.getSegments();
    if (segments.isEmpty()) {
        throw new IAE("No segments found for datasource [%s]", converterConfig.getDataSource());
    }
    converterConfigIntoConfiguration(converterConfig, segments, jobConf);

    jobConf.setNumReduceTasks(0); // Map only. Number of map tasks determined by input format.
    jobConf.setWorkingDirectory(new Path(converterConfig.getDistributedSuccessCache()));

    setJobName(jobConf, segments);

    if (converterConfig.getJobPriority() != null) {
        jobConf.setJobPriority(JobPriority.valueOf(converterConfig.getJobPriority()));
    }

    final Job job = Job.getInstance(jobConf);

    job.setInputFormatClass(ConfigInputFormat.class);
    job.setMapperClass(ConvertingMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setMapSpeculativeExecution(false);
    job.setOutputFormatClass(ConvertingOutputFormat.class);

    JobHelper.setupClasspath(JobHelper.distributedClassPath(jobConf.getWorkingDirectory()),
            JobHelper.distributedClassPath(getJobClassPathDir(job.getJobName(), jobConf.getWorkingDirectory())),
            job);

    Throwable throwable = null;
    try {
        job.submit();
        log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());
        final boolean success = job.waitForCompletion(true);
        if (!success) {
            final TaskReport[] reports = job.getTaskReports(TaskType.MAP);
            if (reports != null) {
                for (final TaskReport report : reports) {
                    log.error("Error in task [%s] : %s", report.getTaskId(),
                            Arrays.toString(report.getDiagnostics()));
                }
            }
            return null;
        }
        try {
            loadedBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_LOADED).getValue();
            writtenBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_WRITTEN).getValue();
        } catch (IOException ex) {
            log.error(ex, "Could not fetch counters");
        }
        final JobID jobID = job.getJobID();

        final Path jobDir = getJobPath(jobID, job.getWorkingDirectory());
        final FileSystem fs = jobDir.getFileSystem(job.getConfiguration());
        final RemoteIterator<LocatedFileStatus> it = fs.listFiles(jobDir, true);
        final List<Path> goodPaths = new ArrayList<>();
        while (it.hasNext()) {
            final LocatedFileStatus locatedFileStatus = it.next();
            if (locatedFileStatus.isFile()) {
                final Path myPath = locatedFileStatus.getPath();
                if (ConvertingOutputFormat.DATA_SUCCESS_KEY.equals(myPath.getName())) {
                    goodPaths.add(new Path(myPath.getParent(), ConvertingOutputFormat.DATA_FILE_KEY));
                }
            }
        }
        if (goodPaths.isEmpty()) {
            log.warn("No good data found at [%s]", jobDir);
            return null;
        }
        final List<DataSegment> returnList = ImmutableList
                .copyOf(Lists.transform(goodPaths, new Function<Path, DataSegment>() {
                    @Nullable
                    @Override
                    public DataSegment apply(final Path input) {
                        try {
                            if (!fs.exists(input)) {
                                throw new ISE("Somehow [%s] was found but [%s] is missing at [%s]",
                                        ConvertingOutputFormat.DATA_SUCCESS_KEY,
                                        ConvertingOutputFormat.DATA_FILE_KEY, jobDir);
                            }
                        } catch (final IOException e) {
                            throw Throwables.propagate(e);
                        }
                        try (final InputStream stream = fs.open(input)) {
                            return HadoopDruidConverterConfig.jsonMapper.readValue(stream, DataSegment.class);
                        } catch (final IOException e) {
                            throw Throwables.propagate(e);
                        }
                    }
                }));
        if (returnList.size() == segments.size()) {
            return returnList;
        } else {
            throw new ISE(
                    "Tasks reported success but result length did not match! Expected %d found %d at path [%s]",
                    segments.size(), returnList.size(), jobDir);
        }
    } catch (InterruptedException | ClassNotFoundException e) {
        RuntimeException exception = Throwables.propagate(e);
        throwable = exception;
        throw exception;
    } catch (Throwable t) {
        throwable = t;
        throw t;
    } finally {
        try {
            cleanup(job);
        } catch (IOException e) {
            if (throwable != null) {
                throwable.addSuppressed(e);
            } else {
                log.error(e, "Could not clean up job [%s]", job.getJobID());
            }
        }
    }
}
From source file:org.apache.druid.indexer.updater.HadoopConverterJob.java
License:Apache License
public static void cleanup(Job job) throws IOException {
    final Path jobDir = getJobPath(job.getJobID(), job.getWorkingDirectory());
    final FileSystem fs = jobDir.getFileSystem(job.getConfiguration());
    RuntimeException e = null;
    try {
        JobHelper.deleteWithRetry(fs, jobDir, true);
    } catch (RuntimeException ex) {
        e = ex;
    }
    try {
        JobHelper.deleteWithRetry(fs, getJobClassPathDir(job.getJobName(), job.getWorkingDirectory()), true);
    } catch (RuntimeException ex) {
        if (e == null) {
            e = ex;
        } else {
            e.addSuppressed(ex);
        }
    }
    if (e != null) {
        throw e;
    }
}
From source file:org.apache.druid.indexer.updater.HadoopConverterJob.java
License:Apache License
public List<DataSegment> run() throws IOException {
    final JobConf jobConf = new JobConf();
    jobConf.setKeepFailedTaskFiles(false);
    for (Map.Entry<String, String> entry : converterConfig.getHadoopProperties().entrySet()) {
        jobConf.set(entry.getKey(), entry.getValue(), "converterConfig.getHadoopProperties()");
    }
    final List<DataSegment> segments = converterConfig.getSegments();
    if (segments.isEmpty()) {
        throw new IAE("No segments found for datasource [%s]", converterConfig.getDataSource());
    }
    converterConfigIntoConfiguration(converterConfig, segments, jobConf);

    jobConf.setNumReduceTasks(0); // Map only. Number of map tasks determined by input format.
    jobConf.setWorkingDirectory(new Path(converterConfig.getDistributedSuccessCache()));

    setJobName(jobConf, segments);

    if (converterConfig.getJobPriority() != null) {
        jobConf.setJobPriority(JobPriority.valueOf(converterConfig.getJobPriority()));
    }

    final Job job = Job.getInstance(jobConf);

    job.setInputFormatClass(ConfigInputFormat.class);
    job.setMapperClass(ConvertingMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setMapSpeculativeExecution(false);
    job.setOutputFormatClass(ConvertingOutputFormat.class);

    JobHelper.setupClasspath(JobHelper.distributedClassPath(jobConf.getWorkingDirectory()),
            JobHelper.distributedClassPath(getJobClassPathDir(job.getJobName(), jobConf.getWorkingDirectory())),
            job);

    Throwable throwable = null;
    try {
        job.submit();
        log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());
        final boolean success = job.waitForCompletion(true);
        if (!success) {
            final TaskReport[] reports = job.getTaskReports(TaskType.MAP);
            if (reports != null) {
                for (final TaskReport report : reports) {
                    log.error("Error in task [%s] : %s", report.getTaskId(),
                            Arrays.toString(report.getDiagnostics()));
                }
            }
            return null;
        }
        try {
            loadedBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_LOADED).getValue();
            writtenBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_WRITTEN).getValue();
        } catch (IOException ex) {
            log.error(ex, "Could not fetch counters");
        }
        final JobID jobID = job.getJobID();

        final Path jobDir = getJobPath(jobID, job.getWorkingDirectory());
        final FileSystem fs = jobDir.getFileSystem(job.getConfiguration());
        final RemoteIterator<LocatedFileStatus> it = fs.listFiles(jobDir, true);
        final List<Path> goodPaths = new ArrayList<>();
        while (it.hasNext()) {
            final LocatedFileStatus locatedFileStatus = it.next();
            if (locatedFileStatus.isFile()) {
                final Path myPath = locatedFileStatus.getPath();
                if (ConvertingOutputFormat.DATA_SUCCESS_KEY.equals(myPath.getName())) {
                    goodPaths.add(new Path(myPath.getParent(), ConvertingOutputFormat.DATA_FILE_KEY));
                }
            }
        }
        if (goodPaths.isEmpty()) {
            log.warn("No good data found at [%s]", jobDir);
            return null;
        }
        final List<DataSegment> returnList = ImmutableList
                .copyOf(Lists.transform(goodPaths, new Function<Path, DataSegment>() {
                    @Nullable
                    @Override
                    public DataSegment apply(final Path input) {
                        try {
                            if (!fs.exists(input)) {
                                throw new ISE("Somehow [%s] was found but [%s] is missing at [%s]",
                                        ConvertingOutputFormat.DATA_SUCCESS_KEY,
                                        ConvertingOutputFormat.DATA_FILE_KEY, jobDir);
                            }
                        } catch (final IOException e) {
                            throw Throwables.propagate(e);
                        }
                        try (final InputStream stream = fs.open(input)) {
                            return HadoopDruidConverterConfig.jsonMapper.readValue(stream, DataSegment.class);
                        } catch (final IOException e) {
                            throw Throwables.propagate(e);
                        }
                    }
                }));
        if (returnList.size() == segments.size()) {
            return returnList;
        } else {
            throw new ISE(
                    "Tasks reported success but result length did not match! Expected %d found %d at path [%s]",
                    segments.size(), returnList.size(), jobDir);
        }
    } catch (InterruptedException | ClassNotFoundException e) {
        RuntimeException exception = Throwables.propagate(e);
        throwable = exception;
        throw exception;
    } catch (Throwable t) {
        throwable = t;
        throw t;
    } finally {
        try {
            cleanup(job);
        } catch (IOException e) {
            if (throwable != null) {
                throwable.addSuppressed(e);
            } else {
                log.error(e, "Could not clean up job [%s]", job.getJobID());
            }
        }
    }
}
From source file:org.kiji.mapreduce.output.HFileMapReduceJobOutput.java
License:Apache License
/**
 * Configures the partitioner for generating HFiles.
 *
 * <p>Each generated HFile should fit within a region of the target table.
 * Additionally, it's optimal to have only one HFile to load into each region, since a
 * read from that region will require reading from each HFile under management (until
 * compaction happens and merges them all back into one HFile).</p>
 *
 * <p>To achieve this, we configure a TotalOrderPartitioner that will partition the
 * records output from the Mapper based on their rank in a total ordering of the
 * keys. The <code>startKeys</code> argument should contain a list of the first key in
 * each of those partitions.</p>
 *
 * @param job The job to configure.
 * @param startKeys A list of keys that will mark the boundaries between the partitions
 *     for the sorted map output records.
 * @throws IOException If there is an error.
 */
private static void configurePartitioner(Job job, List<HFileKeyValue> startKeys) throws IOException {
    job.setPartitionerClass(TotalOrderPartitioner.class);

    LOG.info("Configuring " + startKeys.size() + " reduce partitions.");
    job.setNumReduceTasks(startKeys.size());

    // Write the file that the TotalOrderPartitioner reads to determine where to partition records.
    Path partitionFilePath = new Path(job.getWorkingDirectory(), "partitions_" + System.currentTimeMillis());
    LOG.info("Writing partition information to " + partitionFilePath);

    final FileSystem fs = partitionFilePath.getFileSystem(job.getConfiguration());
    partitionFilePath = partitionFilePath.makeQualified(fs);
    writePartitionFile(job.getConfiguration(), partitionFilePath, startKeys);

    // Add it to the distributed cache.
    try {
        final URI cacheUri = new URI(partitionFilePath.toString() + "#" + TotalOrderPartitioner.DEFAULT_PATH);
        DistributedCache.addCacheFile(cacheUri, job.getConfiguration());
    } catch (URISyntaxException e) {
        throw new IOException(e);
    }
    DistributedCache.createSymlink(job.getConfiguration());
}
From source file:smile.wide.AttributeValueHistogram.java
License:Apache License
@Override
public int run(String[] arg) throws Exception {
    if (arg.length < 2) {
        s_logger.fatal("Usage: AttributeValueHistogram <infile> <outfile>");
        // TODO: return an error code?
    }
    s_logger.debug("Got " + arg.length + " arguments");

    inPath_ = arg[0];
    s_logger.info("Input path is " + inPath_);

    // parse the key-value arguments passed - by now these are the arguments
    // specific to AttributeValueHistogram
    for (int i = 1; i < arg.length; ++i) {
        String[] tokens = arg[i].split("=");
        if (tokens.length != 2) {
            s_logger.fatal("Can't parse argument " + arg[i]);
        }
        if (tokens[0].equals("xdata.bayesnets.datasetreader.class")) {
            readerClass_ = tokens[1].trim();
            s_logger.debug("Set reader class to " + readerClass_);
        } else if (tokens[0].equals("xdata.bayesnets.datasetreader.filter")) {
            readerFilter_ = tokens[1].trim();
            s_logger.debug("Set reader filter to " + readerFilter_);
        } else if (tokens[0].equals("xdata.bayesnets.datasetreader.instid")) {
            readerInstID_ = tokens[1].trim();
            s_logger.debug("Set reader's instance ID column to " + readerInstID_);
        } else if (tokens[0].equals("xdata.bayesnets.datasetreader.variablenames")) {
            variableNames_ = tokens[1].trim();
            s_logger.debug("Set reader's variable names to " + variableNames_);
        } else {
            s_logger.warn("Unknown argument " + arg[i]);
        }
    }

    conf_ = getConf();

    // pass the reader class to the mapper, in jobconf
    // TODO: use setClass here - fails early if wrong, not in the mapper
    conf_.set("xdata.bayesnets.datasetreader.class", readerClass_);
    conf_.set("xdata.bayesnets.datasetreader.filter", readerFilter_);
    // conf_.set("xdata.bayesnets.datasetreader.instid", readerInstID_); // not used
    conf_.set("xdata.bayesnets.datasetreader.variablenames", variableNames_);

    conf_.setBoolean("mapred.compress.map.output", true); // compress intermediate data
    conf_.set("mapred.output.compression.type", CompressionType.BLOCK.toString()); // by block, to keep splittable
    conf_.setClass("mapred.map.output.compression.codec", GzipCodec.class, CompressionCodec.class);

    // for debugging
    conf_.set("keep.failed.task.files", "true");
    conf_.set("keep.failed.task.pattern", "*");

    Job job = new Job(conf_);
    job.setJarByClass(AttributeValueHistogram.class); // use this jar
    job.setJobName("Collect value histograms by attribute");

    FileInputFormat.addInputPath(job, new Path(inPath_));

    int rnd = (new Random()).nextInt();
    lastWorkingDir_ = job.getWorkingDirectory().toUri();
    s_logger.info("Job working directory is " + lastWorkingDir_);
    String tempDirName = job.getWorkingDirectory() + "/tmp/attvalhist" + rnd + ".tmp";
    s_logger.info("Temp files in directory " + tempDirName);
    FileOutputFormat.setOutputPath(job, new Path(tempDirName));

    job.setMapperClass(AttributeValueHistogramMapper.class);
    job.setCombinerClass(AttributeValueHistogramReducer.class);
    job.setReducerClass(AttributeValueHistogramReducer.class);

    // set both the map and reduce in/out classes
    job.setOutputKeyClass(Text.class); // the name of the attribute
    job.setOutputValueClass(MapWritable.class); // Value -> count map
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    // run 'em
    int result = job.waitForCompletion(true) ? 0 : 16;

    // retain the temp file, collect the output
    attributeValues_ = new TreeMap<String, Map<String, Integer>>();

    FileSystem fs = FileSystem.get(conf_);
    SequenceFile.Reader reader = null;

    Path resPath = new Path(tempDirName);
    FileStatus[] stats = fs.listStatus(resPath);

    // read all output files
    for (FileStatus stat : stats) {
        if (stat.getPath().toUri().toString().contains("part-r-"))
            try {
                s_logger.info("Reading results from " + stat.getPath());
                reader = new SequenceFile.Reader(fs, stat.getPath(), conf_);
                // Text key = (Text) ReflectionUtils.newInstance(reader.getKeyClass(), conf_);
                // MapWritable value = (MapWritable) ReflectionUtils.newInstance(reader.getValueClass(), conf_);
                Text key = new Text();
                MapWritable value = new MapWritable();
                while (reader.next(key, value)) {
                    TreeMap<String, Integer> valueCounts = new TreeMap<String, Integer>();
                    for (Writable attValue : value.keySet()) {
                        valueCounts.put(((Text) attValue).toString(),
                                ((IntWritable) (value.get(attValue))).get());
                    }
                    attributeValues_.put(key.toString(), valueCounts);
                }
            } finally {
                IOUtils.closeStream(reader);
            }
    }
    fs.deleteOnExit(resPath);

    return result;
}
From source file:smile.wide.facebook.ExperimentDriver.java
License:Apache License
private int inference() {
    try {
        DistributedCache.createSymlink(conf_);
        try {
            DistributedCache.addCacheFile(new URI(libHDFSPath_ + "/smile.jar#smile.jar"), conf_);
            DistributedCache.addCacheFile(new URI(libHDFSPath_ + "/libjsmile.so#libjsmile.so"), conf_);
            DistributedCache.addCacheFile(
                    new URI(jobHDFSPath_ + "/tmp/" + modifiedNetwork_ + "#" + basename(modifiedNetwork_)),
                    conf_);
        } catch (URISyntaxException e) {
            s_logger.fatal("Bad URL for modified network file.");
            return -12;
        }

        // the principle for whether to use string column names or integer column indexes:
        // - when talking about variables in the BN, use string names
        // - when talking about data munging, use column indexes

        // configure the inference task
        conf_.set("xdata.bayesnets.networkfile", basename(modifiedNetwork_));
        conf_.set("xdata.bayesnets.datasetreader.class", FacebookCSVReader.class.getName());
        conf_.set("xdata.bayesnets.datasetreader.filter", "3,5,7,10,11,12");
        conf_.set("xdata.bayesnets.datasetreader.variablenames",
                "FirstName,MiddleName,Sex,IsAppUser,LikesCount,FriendCount");
        conf_.set("xdata.bayesnets.datasetreader.instid", "1");
        conf_.set("xdata.bayesnets.queryvariable", "Age");

        Job job = new Job(conf_);
        job.setJarByClass(ExperimentDriver.class); // use this jar
        job.setJobName("Facebook Inference Performance Test");

        FileInputFormat.addInputPath(job, new Path(inPath_));
        FileOutputFormat.setOutputPath(job, new Path(outPath_));

        job.setMapperClass(PerInstanceInferenceMapper.class);
        // there need not be any reducer
        // job.setReducerClass(PerInstanceInferenceReducer.class);

        // set both the map and reduce in/out classes
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        // but redefine them for the mapper
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(DoubleArrayWritable.class);

        s_logger.info("Job working directory is " + job.getWorkingDirectory());

        return job.waitForCompletion(true) ? 0 : 1;
    } catch (IOException e) {
        System.err.println("Something went badly wrong in IO.");
        System.exit(2);
    } catch (InterruptedException e) {
        System.err.println("Job interrupted.");
        e.printStackTrace();
    } catch (ClassNotFoundException e) {
        System.err.println("ClassNotFound exception.");
        e.printStackTrace();
    }
    return 2;
}