List of usage examples for org.apache.hadoop.mapreduce.Job#getCounters()
public Counters getCounters() throws IOException
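getCounters() exposes the aggregated counters of a job to the submitting driver, typically after waitForCompletion has returned. A minimal sketch, assuming a fully configured Job instance; TaskCounter.MAP_INPUT_RECORDS is one of Hadoop's built-in per-task counters:

import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.TaskCounter;

// Minimal sketch: read a built-in counter once the job has finished.
if (job.waitForCompletion(true)) {
    Counters counters = job.getCounters();
    Counter mapInputs = counters.findCounter(TaskCounter.MAP_INPUT_RECORDS);
    System.out.println("Map input records: " + mapInputs.getValue());
}

The usage examples below follow the same pattern with both built-in and application-defined counters.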
From source file:org.lilyproject.mapreduce.test.MapReduceTest.java
License:Apache License
private long getTotalInputRecords(Job job) throws IOException {
    return job.getCounters()
            .findCounter("org.apache.hadoop.mapreduce.TaskCounter", "MAP_INPUT_RECORDS")
            .getValue();
}
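The same counter can also be looked up through the TaskCounter enum rather than its string group and counter names, which avoids typos in the fully qualified group name. A hedged alternative sketch of the method above (not from the original source):

// Equivalent lookup via the TaskCounter enum.
private long getTotalInputRecords(Job job) throws IOException {
    return job.getCounters().findCounter(TaskCounter.MAP_INPUT_RECORDS).getValue();
}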
From source file:org.oclc.firefly.hadoop.backup.Backup.java
License:Apache License
/**
 * Performs a complete copy of the source HBase to the given destination
 * @param tables The names of the tables to backup
 * @param maxTries The maximum number of times to try to copy regions.
 * @return True if successful, false otherwise
 * @throws IOException If failed to interact with Hadoop
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public boolean doMajorCopy(String[] tables, int maxTries)
        throws IOException, InterruptedException, ClassNotFoundException {
    boolean ret = false;
    String username = getUsername();
    short replication = (short) getInitialReplication();

    // Get a list of regions from HBase
    // Then filter out the regions we are not extracting, and group them by table
    List<CatalogRow> regions = getHBaseRegions(srcConf);
    Map<String, List<CatalogRow>> filtered = groupAndFilter(regions, tables);
    List<Pair<String, HRegionInfo>> mapperInput = new ArrayList<Pair<String, HRegionInfo>>();

    // Prepare the input for the mappers to use
    // This creates a list of region server and region pairs
    LOG.info("Exporting the following tables:");
    for (Entry<String, List<CatalogRow>> entry : filtered.entrySet()) {
        String tablename = entry.getKey();
        List<CatalogRow> rows = entry.getValue();
        LOG.info(". " + tablename);
        for (CatalogRow r : rows) {
            String regionServer = r.getHost() + ":" + r.getPort();
            HRegionInfo region = r.getHRegionInfo();
            mapperInput.add(Pair.newPair(regionServer, region));
        }
    }

    // Make sure we write to a directory that does not exist
    backupDirectoryPath = createBackupDirectory(getCurrentDateString());
    LOG.info("Starting backup path: " + backupDirectoryPath);

    // Copy the .tableinfo files for the tables we are extracting
    // These files are not copied by the MR job as it only focuses on regions
    List<FileStatus> tableInfoFiles = getTableInfoFiles(srcFs, filtered);
    for (FileStatus file : tableInfoFiles) {
        Path srcFilePath = file.getPath();
        Path relPath = new Path(BackupUtils.getFsRelativePath(srcFs, srcFilePath));
        Path dstFilePath = new Path(backupDirectoryPath.toString() + relPath.toString());
        BackupUtils.copy(srcFs, srcFilePath, dstFs, dstFilePath, buffer, username, replication);
    }

    // Dispatch MR job and monitor
    // Retry regions if necessary
    if (mapperInput.size() > 0) {
        int tries = 0;

        while (!ret && (maxTries == 0 || tries < maxTries)) {
            if (getNumMapTasks() > mapperInput.size()) {
                setNumMapTasks(mapperInput.size());
                LOG.info("Not enough regions. Reducing number of map tasks");
            }

            // Generate a list of mapper input files and create job
            List<Path> sourceFiles = createMapperInputSequenceFiles(mapperInput, getNumMapTasks(), srcFs, tries);
            Job job = createMRJob(srcConf, dstConf, sourceFiles, backupDirectoryPath, tries);

            LOG.info(job.getJobName());
            LOG.info("--------------------------------------------------");
            LOG.info("Number of regions : " + mapperInput.size());
            LOG.info("Number of map tasks: " + getNumMapTasks());
            LOG.info("Mapper input path : " + getMapInputDirectory(tries));
            LOG.info("Mapper output path : " + FileOutputFormat.getOutputPath(job));
            LOG.info("--------------------------------------------------");

            job.waitForCompletion(true);
            if (job.isSuccessful()) {
                // Check if any regions failed
                Counters counters = job.getCounters();
                Counter failedCounter = counters.findCounter("Backup", "FailedRegions");
                long failed = failedCounter.getValue();

                if (failed > 0) {
                    LOG.info("Number of failed regions: " + failed + ".");

                    // get a fresh list of regions to copy
                    List<Pair<String, HRegionInfo>> failedRegions = getFailedRegions(srcFs, srcConf, tries);
                    addCopiedRegions(mapperInput, failedRegions);
                    mapperInput = getRemainingRegions(mapperInput, tables);

                    for (Pair<String, HRegionInfo> pair : mapperInput) {
                        LOG.info("Retry: " + pair.getSecond());
                    }

                    if (mapperInput.size() == 0) {
                        ret = true;
                        backupDirectoryPath = appendEndTime(backupDirectoryPath);
                        LOG.warn("No regions left to copy, but expected to copy more. "
                                + "Please inspect logs/files manually for errors");
                    }
                } else {
                    ret = true;
                    addCopiedRegions(mapperInput, null);
                    backupDirectoryPath = appendEndTime(backupDirectoryPath);
                    LOG.info("MR job finished successfully");
                }
            } else {
                LOG.error("An unexpected error occurred during the MR job. Please see MR logs.");
                break;
            }

            tries++;
        }

        if (ret) {
            if (verifyCopiedRegions()) {
                LOG.info("Verification passed successfully");
            } else {
                ret = false;
                LOG.info("Verification failed. Please inspect errors manually");
            }
        } else {
            LOG.info("No attempts left. Try setting -n to a higher value, or setting it to 0");
        }
    }

    if (ret) {
        // Set replication factor of backup directory to default.
        // This may not be the best solution, but let built-in shell take care of it
        // because it can do it recursively without us having to rediscover all the files
        short finalReplication = (short) getFinalReplication();

        if (replication != finalReplication) {
            FsShell shell = new FsShell(dstConf);
            String[] repArgs = { "-setrep", "-R", "-w", "" + finalReplication, backupDirectoryPath.toString() };

            try {
                LOG.info("Setting final replication factor of backup files to " + finalReplication);
                shell.run(repArgs);
            } catch (Exception e) {
                LOG.warn("Could not set replication factor of backup files to " + finalReplication);
            }
        }
    }

    return ret;
}
From source file:org.opencloudengine.flamingo.mapreduce.util.CounterUtils.java
License:Apache License
/**
 * Returns the counters of the given Job as a Map.
 * Each key combines the group name and the counter name in the form <tt>GROUP_COUNTER</tt>;
 * for example, group <tt>CLEAN</tt> and counter <tt>VALID</tt> produce the key <tt>CLEAN_VALID</tt>.
 *
 * @param job Hadoop Job
 * @return Map of counter names to counter values
 */
public static Map<String, String> getCounters(Job job) {
    Map<String, String> resultMap = new HashMap<String, String>();
    try {
        Counters counters = job.getCounters();
        Iterable<String> groupNames = counters.getGroupNames();
        Iterator<String> groupIterator = groupNames.iterator();
        while (groupIterator.hasNext()) {
            String groupName = groupIterator.next();
            CounterGroup group = counters.getGroup(groupName);
            Iterator<Counter> counterIterator = group.iterator();
            while (counterIterator.hasNext()) {
                Counter counter = counterIterator.next();
                logger.info("[{}] {} = {}",
                        new Object[] { group.getName(), counter.getName(), counter.getValue() });
                String realName = HadoopMetrics.getMetricName(group.getName() + "_" + counter.getName());
                if (!StringUtils.isEmpty(realName)) {
                    resultMap.put(realName, String.valueOf(counter.getValue()));
                }
            }
        }
    } catch (Exception ex) {
        // Ignore counter lookup failures and return whatever was collected so far.
    }
    return resultMap;
}
From source file:org.openflamingo.mapreduce.util.CounterUtils.java
License:Apache License
/**
 * Returns the counters of the given Job as a Map.
 * Each key combines the group name and the counter name in the form <tt>GROUP_COUNTER</tt>;
 * for example, group <tt>CLEAN</tt> and counter <tt>VALID</tt> produce the key <tt>CLEAN_VALID</tt>.
 *
 * @param job Hadoop Job
 * @return Map of counter names to counter values
 */
public static Map<String, String> getCounters(Job job) {
    Map<String, String> resultMap = new HashMap<String, String>();
    try {
        Counters counters = job.getCounters();
        Collection<String> groupNames = counters.getGroupNames();
        Iterator<String> groupIterator = groupNames.iterator();
        while (groupIterator.hasNext()) {
            String groupName = groupIterator.next();
            CounterGroup group = counters.getGroup(groupName);
            Iterator<Counter> counterIterator = group.iterator();
            while (counterIterator.hasNext()) {
                Counter counter = counterIterator.next();
                logger.info("[{}] {} = {}",
                        new Object[] { group.getName(), counter.getName(), counter.getValue() });
                String realName = HadoopMetrics.getMetricName(group.getName() + "_" + counter.getName());
                if (!StringUtils.isEmpty(realName)) {
                    resultMap.put(realName, String.valueOf(counter.getValue()));
                }
            }
        }
    } catch (Exception ex) {
        // Ignore counter lookup failures and return whatever was collected so far.
    }
    return resultMap;
}
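A minimal caller sketch for the utility above, assuming a completed job; the CLEAN_VALID key follows the mapping described in the Javadoc and only appears if HadoopMetrics.getMetricName returns it:

// Collect the job's counters as strings once the job has finished.
job.waitForCompletion(true);
Map<String, String> metrics = CounterUtils.getCounters(job);
// Key shape per the Javadoc: group CLEAN + counter VALID -> "CLEAN_VALID".
String cleanValid = metrics.get("CLEAN_VALID");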
From source file:org.rdfhdt.mrbuilder.HDTBuilderDriver.java
License:Open Source License
protected boolean runDictionaryJob()
        throws ClassNotFoundException, IOException, InterruptedException, URISyntaxException {
    boolean jobOK;
    Job job = null;
    BufferedWriter bufferedWriter;

    // if output path exists...
    if (this.dictionaryFS.exists(this.conf.getDictionaryOutputPath())) {
        if (this.conf.getDeleteDictionaryOutputPath()) {
            // ... and option provided, delete recursively
            this.dictionaryFS.delete(this.conf.getDictionaryOutputPath(), true);
        } else {
            // ... and option not provided, fail
            System.out.println("Dictionary output path does exist: " + this.conf.getDictionaryOutputPath());
            System.out.println("Select other path or use option -dd to overwrite");
            System.exit(-1);
        }
    }

    // Sample the SequenceInputFormat to do TotalSort and create final output
    job = new Job(this.conf.getConfigurationObject(), this.conf.getDictionaryJobName() + " phase 2");

    job.setJarByClass(HDTBuilderDriver.class);

    System.out.println("samples = " + this.conf.getDictionarySamplesPath());
    System.out.println("output = " + this.conf.getDictionaryOutputPath());

    FileInputFormat.addInputPath(job, this.conf.getDictionarySamplesPath());
    FileOutputFormat.setOutputPath(job, this.conf.getDictionaryOutputPath());

    job.setInputFormatClass(SequenceFileInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    // Identity Mapper
    // job.setMapperClass(Mapper.class);
    job.setCombinerClass(DictionaryCombiner.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setReducerClass(DictionaryReducer.class);
    job.setNumReduceTasks(this.conf.getDictionaryReducers());

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    System.out.println("Sampling started");
    InputSampler.writePartitionFile(job,
            new InputSampler.IntervalSampler<Text, Text>(this.conf.getSampleProbability()));
    String partitionFile = TotalOrderPartitioner.getPartitionFile(job.getConfiguration());
    URI partitionUri = new URI(partitionFile + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    DistributedCache.addCacheFile(partitionUri, job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());
    System.out.println("Sampling finished");

    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SHARED, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SUBJECTS, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.PREDICATES, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.OBJECTS, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);

    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    jobOK = job.waitForCompletion(true);

    this.numShared = job.getCounters().findCounter(Counters.Shared).getValue();
    this.numSubjects = job.getCounters().findCounter(Counters.Subjects).getValue();
    this.numPredicates = job.getCounters().findCounter(Counters.Predicates).getValue();
    this.numObjects = job.getCounters().findCounter(Counters.Objects).getValue();

    bufferedWriter = new BufferedWriter(
            new OutputStreamWriter(this.dictionaryFS.create(this.conf.getDictionaryCountersFile())));

    bufferedWriter.write(HDTBuilderConfiguration.SHARED + "=" + this.numShared + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.SUBJECTS + "=" + this.numSubjects + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.PREDICATES + "=" + this.numPredicates + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.OBJECTS + "=" + this.numObjects + "\n");

    bufferedWriter.close();

    return jobOK;
}
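The findCounter(Counters.Shared) calls above read enum-defined application counters, which the HDT builder's tasks presumably increment while classifying RDF terms. A hedged sketch of how such an enum counter is typically declared and incremented on the task side (the reducer below is illustrative, not the project's actual DictionaryReducer):

// Application-defined counters, looked up later via job.getCounters().findCounter(...).
public enum Counters {
    Shared, Subjects, Predicates, Objects, Triples
}

// Illustrative reducer: bump a counter for each emitted subject term.
public static class TermCountingReducer extends Reducer<Text, Text, Text, NullWritable> {
    @Override
    protected void reduce(Text term, Iterable<Text> roles, Context context)
            throws IOException, InterruptedException {
        context.getCounter(Counters.Subjects).increment(1);
        context.write(term, NullWritable.get());
    }
}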
From source file:org.rdfhdt.mrbuilder.HDTBuilderDriver.java
License:Open Source License
protected boolean runDictionaryJobWithOneJob()
        throws ClassNotFoundException, IOException, InterruptedException, URISyntaxException {
    boolean jobOK;
    Job job = null;
    BufferedWriter bufferedWriter;

    // if input path does not exist, fail
    if (!this.inputFS.exists(this.conf.getInputPath())) {
        System.out.println("Dictionary input path does not exist: " + this.conf.getInputPath());
        System.exit(-1);
    }

    // if output path exists...
    if (this.dictionaryFS.exists(this.conf.getDictionaryOutputPath())) {
        if (this.conf.getDeleteDictionaryOutputPath()) {
            // ... and option provided, delete recursively
            this.dictionaryFS.delete(this.conf.getDictionaryOutputPath(), true);
        } else {
            // ... and option not provided, fail
            System.out.println("Dictionary output path does exist: " + this.conf.getDictionaryOutputPath());
            System.out.println("Select other path or use option -dd to overwrite");
            System.exit(-1);
        }
    }

    // Launch job
    job = new Job(this.conf.getConfigurationObject(), this.conf.getTriplesJobName());
    job.setJarByClass(HDTBuilderDriver.class);

    FileInputFormat.addInputPath(job, this.conf.getInputPath());
    FileOutputFormat.setOutputPath(job, this.conf.getDictionaryOutputPath());

    job.setInputFormatClass(LzoTextInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    job.setMapperClass(DictionaryMapper.class);
    job.setCombinerClass(DictionaryCombiner.class);
    job.setReducerClass(DictionaryReducer.class);
    job.setNumReduceTasks(this.conf.getDictionaryReducers());

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SHARED, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SUBJECTS, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.PREDICATES, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.OBJECTS, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);

    jobOK = job.waitForCompletion(true);

    this.numShared = job.getCounters().findCounter(Counters.Shared).getValue();
    this.numSubjects = job.getCounters().findCounter(Counters.Subjects).getValue();
    this.numPredicates = job.getCounters().findCounter(Counters.Predicates).getValue();
    this.numObjects = job.getCounters().findCounter(Counters.Objects).getValue();

    bufferedWriter = new BufferedWriter(
            new OutputStreamWriter(this.dictionaryFS.create(this.conf.getDictionaryCountersFile())));

    bufferedWriter.write(HDTBuilderConfiguration.SHARED + "=" + this.numShared + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.SUBJECTS + "=" + this.numSubjects + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.PREDICATES + "=" + this.numPredicates + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.OBJECTS + "=" + this.numObjects + "\n");

    bufferedWriter.close();

    return jobOK;
}
From source file:org.rdfhdt.mrbuilder.HDTBuilderDriver.java
License:Open Source License
protected boolean runTriplesJobSampling() throws ClassNotFoundException, IOException, InterruptedException {
    Job job = null;
    boolean jobOK;
    BufferedWriter bufferedWriter;

    // if input path does not exist, fail
    if (!this.inputFS.exists(this.conf.getInputPath())) {
        System.out.println("Dictionary input path does not exist: " + this.conf.getInputPath());
        System.exit(-1);
    }

    // if dictionary output path does not exist, fail
    if (!this.dictionaryFS.exists(this.conf.getInputPath())) {
        System.out.println("Dictionary output path does not exist: " + this.conf.getInputPath());
        System.exit(-1);
    }

    // if samples path exists...
    if (this.dictionaryFS.exists(this.conf.getTriplesSamplesPath())) {
        if (this.conf.getDeleteTriplesSamplesPath()) {
            // ... and option provided, delete recursively
            this.dictionaryFS.delete(this.conf.getTriplesSamplesPath(), true);
        } else {
            // ... and option not provided, fail
            System.out.println("Triples samples path does exist: " + this.conf.getTriplesSamplesPath());
            System.out.println("Select other path or use option -dst to overwrite");
            System.exit(-1);
        }
    }

    this.conf.setProperty("mapred.child.java.opts",
            "-XX:ErrorFile=/home/hadoop/tmp/hs_err_pid%p.log -Xmx2500m");

    // Job to create a SequenceInputFormat
    job = new Job(this.conf.getConfigurationObject(), this.conf.getTriplesJobName() + " phase 1");
    job.setJarByClass(HDTBuilderDriver.class);

    FileInputFormat.addInputPath(job, this.conf.getInputPath());
    FileOutputFormat.setOutputPath(job, this.conf.getTriplesSamplesPath());

    job.setInputFormatClass(LzoTextInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    job.setMapperClass(TriplesSPOMapper.class);
    job.setSortComparatorClass(TripleSPOComparator.class);
    job.setGroupingComparatorClass(TripleSPOComparator.class);
    job.setMapOutputKeyClass(TripleSPOWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setOutputKeyClass(TripleSPOWritable.class);
    job.setOutputValueClass(NullWritable.class);
    job.setNumReduceTasks(this.conf.getTriplesReducers());

    DistributedCache.addCacheFile(this.conf.getDictionaryFile().toUri(), job.getConfiguration());

    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    jobOK = job.waitForCompletion(true);

    this.numTriples = job.getCounters().findCounter(Counters.Triples).getValue();
    bufferedWriter = new BufferedWriter(
            new OutputStreamWriter(this.triplesFS.create(this.conf.getTriplesCountersFile())));
    bufferedWriter.write(this.numTriples.toString() + "\n");
    bufferedWriter.close();

    return jobOK;
}
From source file:org.rdfhdt.mrbuilder.HDTBuilderDriver.java
License:Open Source License
protected boolean runTriplesJobWithOneJob()
        throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
    Job job = null;
    boolean jobOK;
    BufferedWriter bufferedWriter;

    // if input path does not exist, fail
    if (!this.inputFS.exists(this.conf.getInputPath())) {
        System.out.println("Dictionary input path does not exist: " + this.conf.getInputPath());
        System.exit(-1);
    }

    // if dictionary output path does not exist, fail
    if (!this.dictionaryFS.exists(this.conf.getInputPath())) {
        System.out.println("Dictionary output path does not exist: " + this.conf.getInputPath());
        System.exit(-1);
    }

    // if triples output path exists...
    if (this.triplesFS.exists(this.conf.getTriplesOutputPath())) {
        if (this.conf.getDeleteTriplesOutputPath()) {
            // ... and option provided, delete recursively
            this.triplesFS.delete(this.conf.getTriplesOutputPath(), true);
        } else {
            // ... and option not provided, fail
            System.out.println("Triples output path does exist: " + this.conf.getTriplesOutputPath());
            System.out.println("Select other path or use option -dt to overwrite");
            System.exit(-1);
        }
    }

    // Launch job
    this.conf.setProperty("mapred.child.java.opts",
            "-XX:ErrorFile=/home/hadoop/tmp/hs_err_pid%p.log -Xmx2500m");
    job = new Job(this.conf.getConfigurationObject(), this.conf.getDictionaryJobName());
    job.setJarByClass(HDTBuilderDriver.class);

    FileInputFormat.addInputPath(job, this.conf.getInputPath());
    FileOutputFormat.setOutputPath(job, this.conf.getTriplesOutputPath());

    job.setInputFormatClass(LzoTextInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    job.setMapperClass(TriplesSPOMapper.class);
    job.setSortComparatorClass(TripleSPOComparator.class);
    job.setMapOutputKeyClass(TripleSPOWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setNumReduceTasks(this.conf.getTriplesReducers());
    job.setOutputKeyClass(TripleSPOWritable.class);
    job.setOutputValueClass(NullWritable.class);

    DistributedCache.addCacheFile(this.conf.getDictionaryFile().toUri(), job.getConfiguration());
    // DistributedCache.addCacheFile(this.conf.getDictionaryMapFile().toUri(), job.getConfiguration());
    // DistributedCache.addCacheFile(this.conf.getDictionaryReduceFile().toUri(), job.getConfiguration());

    jobOK = job.waitForCompletion(true);

    this.numTriples = job.getCounters().findCounter(Counters.Triples).getValue();
    bufferedWriter = new BufferedWriter(
            new OutputStreamWriter(this.triplesFS.create(this.conf.getTriplesCountersFile())));
    bufferedWriter.write(this.numTriples.toString() + "\n");
    bufferedWriter.close();

    return jobOK;
}
From source file:org.sleuthkit.hadoop.clustering.SequenceFsEntryText.java
License:Open Source License
/**
 * Runs a mapreduce task which will iterate over the HBase entries table
 * using FSEntry. It will output files on the hdd with the identifier
 * id that have grep matches to one or more sequence files in outDir.
 */
public static boolean runPipeline(String outDir, String id, String friendlyName) {
    try {
        Job job = SKJobFactory.createJob(id, friendlyName, JobNames.GREP_MATCHED_FILES_OUT);
        job.setJarByClass(SequenceFsEntryText.class);
        job.setMapperClass(SequenceFsEntryTextMapper.class);

        // We don't need a combiner or a reducer for this job. We aren't
        // writing anything out either.
        job.setNumReduceTasks(0);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(FsEntryHBaseInputFormat.class);
        FsEntryHBaseInputFormat.setupJob(job, id);

        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputPath(job, new Path(outDir));

        // we want to search the default grep results. If there are any, then
        // dump the text to a file.
        job.getConfiguration().set(GREP_MATCHES_TO_SEARCH, HBaseConstants.GREP_RESULTS);

        job.waitForCompletion(true);

        long filesWritten = job.getCounters().findCounter(WrittenDocumentCount.DOCUMENTS).getValue();

        // If we wrote some files, return true. Else, return false.
        return filesWritten != 0;
    } catch (IOException ex) {
        LOG.error("IO Exception while writing documents to sequence files.", ex);
    } catch (DecoderException ex) {
        LOG.error("Decoder Exception while setting up FsEntryHBaseInputFormat.", ex);
    } catch (InterruptedException ex) {
        LOG.error("InterruptedException while running job.", ex);
    } catch (ClassNotFoundException ex) {
        LOG.error("ClassNotFoundException while spinning off job.", ex);
    }
    return false;
}
From source file:org.sleuthkit.hadoop.scoring.CrossImageScorerJob.java
License:Open Source License
public static void runPipeline(String imgDir, String imgID, String friendlyName) {
    try {
        Path crossImageDir = new Path(imgDir + "/crossimg/data/");
        Path scoreDir = new Path(imgDir + "/crossimg/scores/");

        Job j = SKJobFactory.createJob(imgID, friendlyName, JobNames.CROSS_IMG_SIM_SCORING);

        j.setInputFormatClass(TableInputFormat.class);
        j.setOutputFormatClass(SequenceFileOutputFormat.class);
        j.setMapperClass(CrossImageScoreMapper.class);
        j.setReducerClass(CrossImageScoreReducer.class);

        j.setMapOutputKeyClass(BytesWritable.class);
        j.setMapOutputValueClass(BytesWritable.class);

        j.setOutputKeyClass(BytesWritable.class);
        j.setOutputValueClass(BytesArrayWritable.class);

        j.setJarByClass(CrossImageScoreMapper.class);
        SequenceFileOutputFormat.setOutputPath(j, crossImageDir);

        final Scan scan = new Scan();
        // This isn't good (who would ever want to use a regex to check string length?)
        // However, short of writing an entire input format, this is the best we can do.
        // It seems to improve performance by >50% with NSRL loaded in, so it's better
        // than nothing.
        scan.setFilter(new RowFilter(CompareOp.EQUAL, new RegexStringComparator(".{20,}")));

        HBaseConfiguration.addHbaseResources(j.getConfiguration());
        j.getConfiguration().set(TableInputFormat.INPUT_TABLE, "hash");
        j.getConfiguration().set(TableInputFormat.SCAN, convertScanToString(scan));
        j.getConfiguration().set(SKMapper.ID_KEY, imgID);

        j.waitForCompletion(true);

        // get the files in this image from the hadoop counter.
        long filesInImage = j.getCounters().findCounter(FileCount.FILES).getValue();

        j = SKJobFactory.createJob(imgID, friendlyName, JobNames.CROSS_IMG_SIM_SCORING_CALC);
        j.getConfiguration().setLong(IIFScoreReducer.FILES_IN_IMAGE, filesInImage);
        // TODO: Get the number of images from the images table. This is pretty key for IIF.
        j.getConfiguration().setLong(IIFScoreMapper.TOTAL_IMAGES, 11);
        j.setMapperClass(IIFScoreMapper.class);
        j.setReducerClass(IIFScoreReducer.class);
        j.setJarByClass(IIFScoreMapper.class);

        j.setInputFormatClass(SequenceFileInputFormat.class);
        j.setOutputFormatClass(TextOutputFormat.class);

        j.setMapOutputKeyClass(BytesWritable.class);
        j.setMapOutputValueClass(DoubleWritable.class);

        j.setOutputKeyClass(NullWritable.class);
        j.setOutputValueClass(Text.class);

        // Because we're building a json object, we need to have exactly one reducer.
        j.setNumReduceTasks(1);

        SequenceFileOutputFormat.setOutputPath(j, scoreDir);
        SequenceFileInputFormat.setInputPaths(j, crossImageDir);

        j.waitForCompletion(true);

        CrossImageJSONOutputBuilder.buildReport(new Path(scoreDir, "part-r-00000"),
                new Path(imgDir + "/reports/data/crossimg.js"));
    } catch (IOException ex) {
        LOG.error("Failure while performing HDFS file IO.", ex);
    } catch (ClassNotFoundException ex) {
        LOG.error("Error running job; class not found.", ex);
    } catch (InterruptedException ex) {
        LOG.error("Hadoop job interrupted.", ex);
    }
}
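This example also shows the common handoff pattern: tasks of the second job cannot call getCounters() on the first job, so the driver reads the FileCount.FILES counter and forwards it through the Configuration. A hedged sketch of the consuming side (the class, the key string, and the reducer body are illustrative, not taken from the CrossImageScorerJob source):

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Illustrative reducer for the second job: recover the counter value that the
// driver copied into the Configuration after the first job finished.
public static class IIFScoreReducerSketch extends Reducer<BytesWritable, DoubleWritable, NullWritable, Text> {
    public static final String FILES_IN_IMAGE = "iif.files.in.image"; // hypothetical key

    private long filesInImage;

    @Override
    protected void setup(Context context) {
        // Value originally produced by FileCount.FILES in job 1 and forwarded via setLong(...).
        filesInImage = context.getConfiguration().getLong(FILES_IN_IMAGE, 0L);
    }
}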