List of usage examples for org.apache.hadoop.mapreduce.Job#getCounters()
public Counters getCounters() throws IOException
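getCounters() exposes the aggregated counters of a job to the submitting driver, typically after waitForCompletion has returned. A minimal sketch, assuming a fully configured Job instance; TaskCounter.MAP_INPUT_RECORDS is one of Hadoop's built-in per-task counters:

import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.TaskCounter;

// Minimal sketch: read a built-in counter once the job has finished.
if (job.waitForCompletion(true)) {
    Counters counters = job.getCounters();
    Counter mapInputs = counters.findCounter(TaskCounter.MAP_INPUT_RECORDS);
    System.out.println("Map input records: " + mapInputs.getValue());
}

The usage examples below follow the same pattern with both built-in and application-defined counters.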
From source file:org.lilyproject.mapreduce.test.MapReduceTest.java
License:Apache License
private long getTotalInputRecords(Job job) throws IOException {
    return job.getCounters()
            .findCounter("org.apache.hadoop.mapreduce.TaskCounter", "MAP_INPUT_RECORDS")
            .getValue();
}
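The same counter can also be looked up through the TaskCounter enum rather than its string group and counter names, which avoids typos in the fully qualified group name. A hedged alternative sketch of the method above (not from the original source):

// Equivalent lookup via the TaskCounter enum.
private long getTotalInputRecords(Job job) throws IOException {
    return job.getCounters().findCounter(TaskCounter.MAP_INPUT_RECORDS).getValue();
}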
From source file:org.oclc.firefly.hadoop.backup.Backup.java
License:Apache License
/**
 * Performs a complete copy of the source HBase to the given destination
 * @param tables The names of the tables to backup
 * @param maxTries The maximum number of times to try to copy regions.
 * @return True if successful, false otherwise
 * @throws IOException If failed to interact with Hadoop
 * @throws ClassNotFoundException
 * @throws InterruptedException
 */
public boolean doMajorCopy(String[] tables, int maxTries)
        throws IOException, InterruptedException, ClassNotFoundException {
    boolean ret = false;
    String username = getUsername();
    short replication = (short) getInitialReplication();

    // Get a list of regions from HBase
    // Then filter out the regions we are not extracting, and group them by table
    List<CatalogRow> regions = getHBaseRegions(srcConf);
    Map<String, List<CatalogRow>> filtered = groupAndFilter(regions, tables);
    List<Pair<String, HRegionInfo>> mapperInput = new ArrayList<Pair<String, HRegionInfo>>();

    // Prepare the input for the mappers to use
    // This creates a list of region server and region pairs
    LOG.info("Exporting the following tables:");
    for (Entry<String, List<CatalogRow>> entry : filtered.entrySet()) {
        String tablename = entry.getKey();
        List<CatalogRow> rows = entry.getValue();
        LOG.info(". " + tablename);
        for (CatalogRow r : rows) {
            String regionServer = r.getHost() + ":" + r.getPort();
            HRegionInfo region = r.getHRegionInfo();
            mapperInput.add(Pair.newPair(regionServer, region));
        }
    }

    // Make sure we write to a directory that does not exist
    backupDirectoryPath = createBackupDirectory(getCurrentDateString());
    LOG.info("Starting backup path: " + backupDirectoryPath);

    // Copy the .tableinfo files for the tables we are extracting
    // These files are not copied by the MR job as it only focuses on regions
    List<FileStatus> tableInfoFiles = getTableInfoFiles(srcFs, filtered);
    for (FileStatus file : tableInfoFiles) {
        Path srcFilePath = file.getPath();
        Path relPath = new Path(BackupUtils.getFsRelativePath(srcFs, srcFilePath));
        Path dstFilePath = new Path(backupDirectoryPath.toString() + relPath.toString());
        BackupUtils.copy(srcFs, srcFilePath, dstFs, dstFilePath, buffer, username, replication);
    }

    // Dispatch MR job and monitor
    // Retry regions if necessary
    if (mapperInput.size() > 0) {
        int tries = 0;

        while (!ret && (maxTries == 0 || tries < maxTries)) {
            if (getNumMapTasks() > mapperInput.size()) {
                setNumMapTasks(mapperInput.size());
                LOG.info("Not enough regions. Reducing number of map tasks");
            }

            // Generate a list of mapper input files and create job
            List<Path> sourceFiles = createMapperInputSequenceFiles(mapperInput, getNumMapTasks(), srcFs, tries);
            Job job = createMRJob(srcConf, dstConf, sourceFiles, backupDirectoryPath, tries);

            LOG.info(job.getJobName());
            LOG.info("--------------------------------------------------");
            LOG.info("Number of regions : " + mapperInput.size());
            LOG.info("Number of map tasks: " + getNumMapTasks());
            LOG.info("Mapper input path : " + getMapInputDirectory(tries));
            LOG.info("Mapper output path : " + FileOutputFormat.getOutputPath(job));
            LOG.info("--------------------------------------------------");

            job.waitForCompletion(true);
            if (job.isSuccessful()) {
                // Check if any regions failed
                Counters counters = job.getCounters();
                Counter failedCounter = counters.findCounter("Backup", "FailedRegions");
                long failed = failedCounter.getValue();

                if (failed > 0) {
                    LOG.info("Number of failed regions: " + failed + ".");

                    // get a fresh list of regions to copy
                    List<Pair<String, HRegionInfo>> failedRegions = getFailedRegions(srcFs, srcConf, tries);
                    addCopiedRegions(mapperInput, failedRegions);
                    mapperInput = getRemainingRegions(mapperInput, tables);

                    for (Pair<String, HRegionInfo> pair : mapperInput) {
                        LOG.info("Retry: " + pair.getSecond());
                    }

                    if (mapperInput.size() == 0) {
                        ret = true;
                        backupDirectoryPath = appendEndTime(backupDirectoryPath);
                        LOG.warn("No regions left to copy, but expected to copy more. "
                                + "Please inspect logs/files manually for errors");
                    }
                } else {
                    ret = true;
                    addCopiedRegions(mapperInput, null);
                    backupDirectoryPath = appendEndTime(backupDirectoryPath);
                    LOG.info("MR job finished successfully");
                }
            } else {
                LOG.error("An unexpected error occurred during the MR job. Please see MR logs.");
                break;
            }

            tries++;
        }

        if (ret) {
            if (verifyCopiedRegions()) {
                LOG.info("Verification passed successfully");
            } else {
                ret = false;
                LOG.info("Verification failed. Please inspect errors manually");
            }
        } else {
            LOG.info("No attempts left. Try setting -n to a higher value, or setting it to 0");
        }
    }

    if (ret) {
        // Set replication factor of backup directory to default.
        // This may not be the best solution, but let built-in shell take care of it
        // because it can do it recursively without us having to rediscover all the files
        short finalReplication = (short) getFinalReplication();

        if (replication != finalReplication) {
            FsShell shell = new FsShell(dstConf);
            String[] repArgs = { "-setrep", "-R", "-w", "" + finalReplication, backupDirectoryPath.toString() };

            try {
                LOG.info("Setting final replication factor of backup files to " + finalReplication);
                shell.run(repArgs);
            } catch (Exception e) {
                LOG.warn("Could not set replication factor of backup files to " + finalReplication);
            }
        }
    }

    return ret;
}
From source file:org.opencloudengine.flamingo.mapreduce.util.CounterUtils.java
License:Apache License
/**
 * Returns the counters of the given Job as a Map.
 * Each key combines the group name and the counter name in the form <tt>GROUP_COUNTER</tt>;
 * for example, group <tt>CLEAN</tt> and counter <tt>VALID</tt> produce the key <tt>CLEAN_VALID</tt>.
 *
 * @param job Hadoop Job
 * @return Map of counter names to counter values
 */
public static Map<String, String> getCounters(Job job) {
    Map<String, String> resultMap = new HashMap<String, String>();
    try {
        Counters counters = job.getCounters();
        Iterable<String> groupNames = counters.getGroupNames();
        Iterator<String> groupIterator = groupNames.iterator();
        while (groupIterator.hasNext()) {
            String groupName = groupIterator.next();
            CounterGroup group = counters.getGroup(groupName);
            Iterator<Counter> counterIterator = group.iterator();
            while (counterIterator.hasNext()) {
                Counter counter = counterIterator.next();
                logger.info("[{}] {} = {}",
                        new Object[] { group.getName(), counter.getName(), counter.getValue() });
                String realName = HadoopMetrics.getMetricName(group.getName() + "_" + counter.getName());
                if (!StringUtils.isEmpty(realName)) {
                    resultMap.put(realName, String.valueOf(counter.getValue()));
                }
            }
        }
    } catch (Exception ex) {
        // Ignore counter lookup failures and return whatever was collected so far.
    }
    return resultMap;
}
From source file:org.openflamingo.mapreduce.util.CounterUtils.java
License:Apache License
/**
 * Returns the counters of the given Job as a Map.
 * Each key combines the group name and the counter name in the form <tt>GROUP_COUNTER</tt>;
 * for example, group <tt>CLEAN</tt> and counter <tt>VALID</tt> produce the key <tt>CLEAN_VALID</tt>.
 *
 * @param job Hadoop Job
 * @return Map of counter names to counter values
 */
public static Map<String, String> getCounters(Job job) {
    Map<String, String> resultMap = new HashMap<String, String>();
    try {
        Counters counters = job.getCounters();
        Collection<String> groupNames = counters.getGroupNames();
        Iterator<String> groupIterator = groupNames.iterator();
        while (groupIterator.hasNext()) {
            String groupName = groupIterator.next();
            CounterGroup group = counters.getGroup(groupName);
            Iterator<Counter> counterIterator = group.iterator();
            while (counterIterator.hasNext()) {
                Counter counter = counterIterator.next();
                logger.info("[{}] {} = {}",
                        new Object[] { group.getName(), counter.getName(), counter.getValue() });
                String realName = HadoopMetrics.getMetricName(group.getName() + "_" + counter.getName());
                if (!StringUtils.isEmpty(realName)) {
                    resultMap.put(realName, String.valueOf(counter.getValue()));
                }
            }
        }
    } catch (Exception ex) {
        // Ignore counter lookup failures and return whatever was collected so far.
    }
    return resultMap;
}
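A minimal caller sketch for the utility above, assuming a completed job; the CLEAN_VALID key follows the mapping described in the Javadoc and only appears if HadoopMetrics.getMetricName returns it:

// Collect the job's counters as strings once the job has finished.
job.waitForCompletion(true);
Map<String, String> metrics = CounterUtils.getCounters(job);
// Key shape per the Javadoc: group CLEAN + counter VALID -> "CLEAN_VALID".
String cleanValid = metrics.get("CLEAN_VALID");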
From source file:org.rdfhdt.mrbuilder.HDTBuilderDriver.java
License:Open Source License
protected boolean runDictionaryJob()
        throws ClassNotFoundException, IOException, InterruptedException, URISyntaxException {
    boolean jobOK;
    Job job = null;
    BufferedWriter bufferedWriter;

    // if output path exists...
    if (this.dictionaryFS.exists(this.conf.getDictionaryOutputPath())) {
        if (this.conf.getDeleteDictionaryOutputPath()) {
            // ... and option provided, delete recursively
            this.dictionaryFS.delete(this.conf.getDictionaryOutputPath(), true);
        } else {
            // ... and option not provided, fail
            System.out.println("Dictionary output path does exist: " + this.conf.getDictionaryOutputPath());
            System.out.println("Select other path or use option -dd to overwrite");
            System.exit(-1);
        }
    }

    // Sample the SequenceInputFormat to do TotalSort and create final output
    job = new Job(this.conf.getConfigurationObject(), this.conf.getDictionaryJobName() + " phase 2");

    job.setJarByClass(HDTBuilderDriver.class);

    System.out.println("samples = " + this.conf.getDictionarySamplesPath());
    System.out.println("output = " + this.conf.getDictionaryOutputPath());

    FileInputFormat.addInputPath(job, this.conf.getDictionarySamplesPath());
    FileOutputFormat.setOutputPath(job, this.conf.getDictionaryOutputPath());

    job.setInputFormatClass(SequenceFileInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    // Identity Mapper
    // job.setMapperClass(Mapper.class);
    job.setCombinerClass(DictionaryCombiner.class);
    job.setPartitionerClass(TotalOrderPartitioner.class);
    job.setReducerClass(DictionaryReducer.class);
    job.setNumReduceTasks(this.conf.getDictionaryReducers());

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    System.out.println("Sampling started");
    InputSampler.writePartitionFile(job,
            new InputSampler.IntervalSampler<Text, Text>(this.conf.getSampleProbability()));
    String partitionFile = TotalOrderPartitioner.getPartitionFile(job.getConfiguration());
    URI partitionUri = new URI(partitionFile + "#" + TotalOrderPartitioner.DEFAULT_PATH);
    DistributedCache.addCacheFile(partitionUri, job.getConfiguration());
    DistributedCache.createSymlink(job.getConfiguration());
    System.out.println("Sampling finished");

    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SHARED, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SUBJECTS, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.PREDICATES, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.OBJECTS, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);

    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    jobOK = job.waitForCompletion(true);

    this.numShared = job.getCounters().findCounter(Counters.Shared).getValue();
    this.numSubjects = job.getCounters().findCounter(Counters.Subjects).getValue();
    this.numPredicates = job.getCounters().findCounter(Counters.Predicates).getValue();
    this.numObjects = job.getCounters().findCounter(Counters.Objects).getValue();

    bufferedWriter = new BufferedWriter(
            new OutputStreamWriter(this.dictionaryFS.create(this.conf.getDictionaryCountersFile())));

    bufferedWriter.write(HDTBuilderConfiguration.SHARED + "=" + this.numShared + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.SUBJECTS + "=" + this.numSubjects + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.PREDICATES + "=" + this.numPredicates + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.OBJECTS + "=" + this.numObjects + "\n");

    bufferedWriter.close();

    return jobOK;
}
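The findCounter(Counters.Shared) calls above read enum-defined application counters, which the HDT builder's tasks presumably increment while classifying RDF terms. A hedged sketch of how such an enum counter is typically declared and incremented on the task side (the reducer below is illustrative, not the project's actual DictionaryReducer):

// Application-defined counters, looked up later via job.getCounters().findCounter(...).
public enum Counters {
    Shared, Subjects, Predicates, Objects, Triples
}

// Illustrative reducer: bump a counter for each emitted subject term.
public static class TermCountingReducer extends Reducer<Text, Text, Text, NullWritable> {
    @Override
    protected void reduce(Text term, Iterable<Text> roles, Context context)
            throws IOException, InterruptedException {
        context.getCounter(Counters.Subjects).increment(1);
        context.write(term, NullWritable.get());
    }
}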
From source file:org.rdfhdt.mrbuilder.HDTBuilderDriver.java
License:Open Source License
protected boolean runDictionaryJobWithOneJob()
        throws ClassNotFoundException, IOException, InterruptedException, URISyntaxException {
    boolean jobOK;
    Job job = null;
    BufferedWriter bufferedWriter;

    // if input path does not exist, fail
    if (!this.inputFS.exists(this.conf.getInputPath())) {
        System.out.println("Dictionary input path does not exist: " + this.conf.getInputPath());
        System.exit(-1);
    }

    // if output path exists...
    if (this.dictionaryFS.exists(this.conf.getDictionaryOutputPath())) {
        if (this.conf.getDeleteDictionaryOutputPath()) {
            // ... and option provided, delete recursively
            this.dictionaryFS.delete(this.conf.getDictionaryOutputPath(), true);
        } else {
            // ... and option not provided, fail
            System.out.println("Dictionary output path does exist: " + this.conf.getDictionaryOutputPath());
            System.out.println("Select other path or use option -dd to overwrite");
            System.exit(-1);
        }
    }

    // Launch job
    job = new Job(this.conf.getConfigurationObject(), this.conf.getTriplesJobName());
    job.setJarByClass(HDTBuilderDriver.class);

    FileInputFormat.addInputPath(job, this.conf.getInputPath());
    FileOutputFormat.setOutputPath(job, this.conf.getDictionaryOutputPath());

    job.setInputFormatClass(LzoTextInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    job.setMapperClass(DictionaryMapper.class);
    job.setCombinerClass(DictionaryCombiner.class);
    job.setReducerClass(DictionaryReducer.class);
    job.setNumReduceTasks(this.conf.getDictionaryReducers());

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SHARED, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.SUBJECTS, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.PREDICATES, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(job, HDTBuilderConfiguration.OBJECTS, SequenceFileOutputFormat.class,
            Text.class, NullWritable.class);

    jobOK = job.waitForCompletion(true);

    this.numShared = job.getCounters().findCounter(Counters.Shared).getValue();
    this.numSubjects = job.getCounters().findCounter(Counters.Subjects).getValue();
    this.numPredicates = job.getCounters().findCounter(Counters.Predicates).getValue();
    this.numObjects = job.getCounters().findCounter(Counters.Objects).getValue();

    bufferedWriter = new BufferedWriter(
            new OutputStreamWriter(this.dictionaryFS.create(this.conf.getDictionaryCountersFile())));

    bufferedWriter.write(HDTBuilderConfiguration.SHARED + "=" + this.numShared + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.SUBJECTS + "=" + this.numSubjects + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.PREDICATES + "=" + this.numPredicates + "\n");
    bufferedWriter.write(HDTBuilderConfiguration.OBJECTS + "=" + this.numObjects + "\n");

    bufferedWriter.close();

    return jobOK;
}
From source file:org.rdfhdt.mrbuilder.HDTBuilderDriver.java
License:Open Source License
protected boolean runTriplesJobSampling() throws ClassNotFoundException, IOException, InterruptedException {
    Job job = null;
    boolean jobOK;
    BufferedWriter bufferedWriter;

    // if input path does not exist, fail
    if (!this.inputFS.exists(this.conf.getInputPath())) {
        System.out.println("Dictionary input path does not exist: " + this.conf.getInputPath());
        System.exit(-1);
    }

    // if dictionary output path does not exist, fail
    if (!this.dictionaryFS.exists(this.conf.getInputPath())) {
        System.out.println("Dictionary output path does not exist: " + this.conf.getInputPath());
        System.exit(-1);
    }

    // if samples path exists...
    if (this.dictionaryFS.exists(this.conf.getTriplesSamplesPath())) {
        if (this.conf.getDeleteTriplesSamplesPath()) {
            // ... and option provided, delete recursively
            this.dictionaryFS.delete(this.conf.getTriplesSamplesPath(), true);
        } else {
            // ... and option not provided, fail
            System.out.println("Triples samples path does exist: " + this.conf.getTriplesSamplesPath());
            System.out.println("Select other path or use option -dst to overwrite");
            System.exit(-1);
        }
    }

    this.conf.setProperty("mapred.child.java.opts",
            "-XX:ErrorFile=/home/hadoop/tmp/hs_err_pid%p.log -Xmx2500m");

    // Job to create a SequenceInputFormat
    job = new Job(this.conf.getConfigurationObject(), this.conf.getTriplesJobName() + " phase 1");
    job.setJarByClass(HDTBuilderDriver.class);

    FileInputFormat.addInputPath(job, this.conf.getInputPath());
    FileOutputFormat.setOutputPath(job, this.conf.getTriplesSamplesPath());

    job.setInputFormatClass(LzoTextInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    job.setMapperClass(TriplesSPOMapper.class);
    job.setSortComparatorClass(TripleSPOComparator.class);
    job.setGroupingComparatorClass(TripleSPOComparator.class);
    job.setMapOutputKeyClass(TripleSPOWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setOutputKeyClass(TripleSPOWritable.class);
    job.setOutputValueClass(NullWritable.class);
    job.setNumReduceTasks(this.conf.getTriplesReducers());

    DistributedCache.addCacheFile(this.conf.getDictionaryFile().toUri(), job.getConfiguration());

    SequenceFileOutputFormat.setCompressOutput(job, true);
    SequenceFileOutputFormat.setOutputCompressorClass(job, com.hadoop.compression.lzo.LzoCodec.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);

    jobOK = job.waitForCompletion(true);

    this.numTriples = job.getCounters().findCounter(Counters.Triples).getValue();
    bufferedWriter = new BufferedWriter(
            new OutputStreamWriter(this.triplesFS.create(this.conf.getTriplesCountersFile())));
    bufferedWriter.write(this.numTriples.toString() + "\n");
    bufferedWriter.close();

    return jobOK;
}
From source file:org.rdfhdt.mrbuilder.HDTBuilderDriver.java
License:Open Source License
protected boolean runTriplesJobWithOneJob()
        throws IOException, ClassNotFoundException, InterruptedException, URISyntaxException {
    Job job = null;
    boolean jobOK;
    BufferedWriter bufferedWriter;

    // if input path does not exist, fail
    if (!this.inputFS.exists(this.conf.getInputPath())) {
        System.out.println("Dictionary input path does not exist: " + this.conf.getInputPath());
        System.exit(-1);
    }

    // if dictionary output path does not exist, fail
    if (!this.dictionaryFS.exists(this.conf.getInputPath())) {
        System.out.println("Dictionary output path does not exist: " + this.conf.getInputPath());
        System.exit(-1);
    }

    // if triples output path exists...
    if (this.triplesFS.exists(this.conf.getTriplesOutputPath())) {
        if (this.conf.getDeleteTriplesOutputPath()) {
            // ... and option provided, delete recursively
            this.triplesFS.delete(this.conf.getTriplesOutputPath(), true);
        } else {
            // ... and option not provided, fail
            System.out.println("Triples output path does exist: " + this.conf.getTriplesOutputPath());
            System.out.println("Select other path or use option -dt to overwrite");
            System.exit(-1);
        }
    }

    // Launch job
    this.conf.setProperty("mapred.child.java.opts",
            "-XX:ErrorFile=/home/hadoop/tmp/hs_err_pid%p.log -Xmx2500m");
    job = new Job(this.conf.getConfigurationObject(), this.conf.getDictionaryJobName());
    job.setJarByClass(HDTBuilderDriver.class);

    FileInputFormat.addInputPath(job, this.conf.getInputPath());
    FileOutputFormat.setOutputPath(job, this.conf.getTriplesOutputPath());

    job.setInputFormatClass(LzoTextInputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);

    job.setMapperClass(TriplesSPOMapper.class);
    job.setSortComparatorClass(TripleSPOComparator.class);
    job.setMapOutputKeyClass(TripleSPOWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setNumReduceTasks(this.conf.getTriplesReducers());
    job.setOutputKeyClass(TripleSPOWritable.class);
    job.setOutputValueClass(NullWritable.class);

    DistributedCache.addCacheFile(this.conf.getDictionaryFile().toUri(), job.getConfiguration());
    // DistributedCache.addCacheFile(this.conf.getDictionaryMapFile().toUri(), job.getConfiguration());
    // DistributedCache.addCacheFile(this.conf.getDictionaryReduceFile().toUri(), job.getConfiguration());

    jobOK = job.waitForCompletion(true);

    this.numTriples = job.getCounters().findCounter(Counters.Triples).getValue();
    bufferedWriter = new BufferedWriter(
            new OutputStreamWriter(this.triplesFS.create(this.conf.getTriplesCountersFile())));
    bufferedWriter.write(this.numTriples.toString() + "\n");
    bufferedWriter.close();

    return jobOK;
}
From source file:org.sleuthkit.hadoop.clustering.SequenceFsEntryText.java
License:Open Source License
/**
 * Runs a mapreduce task which will iterate over the HBase entries table
 * using FSEntry. It will output files on the hdd with the identifier
 * id that have grep matches to one or more sequence files in outDir.
 */
public static boolean runPipeline(String outDir, String id, String friendlyName) {
    try {
        Job job = SKJobFactory.createJob(id, friendlyName, JobNames.GREP_MATCHED_FILES_OUT);
        job.setJarByClass(SequenceFsEntryText.class);
        job.setMapperClass(SequenceFsEntryTextMapper.class);

        // We don't need a combiner or a reducer for this job. We aren't
        // writing anything out either.
        job.setNumReduceTasks(0);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.setInputFormatClass(FsEntryHBaseInputFormat.class);
        FsEntryHBaseInputFormat.setupJob(job, id);

        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputPath(job, new Path(outDir));

        // we want to search the default grep results. If there are any, then
        // dump the text to a file.
        job.getConfiguration().set(GREP_MATCHES_TO_SEARCH, HBaseConstants.GREP_RESULTS);

        job.waitForCompletion(true);

        long filesWritten = job.getCounters().findCounter(WrittenDocumentCount.DOCUMENTS).getValue();

        // If we wrote some files, return true. Else, return false.
        return filesWritten != 0;
    } catch (IOException ex) {
        LOG.error("IO Exception while writing documents to sequence files.", ex);
    } catch (DecoderException ex) {
        LOG.error("Decoder Exception while setting up FsEntryHBaseInputFormat.", ex);
    } catch (InterruptedException ex) {
        LOG.error("InterruptedException while running job.", ex);
    } catch (ClassNotFoundException ex) {
        LOG.error("ClassNotFoundException while spinning off job.", ex);
    }
    return false;
}
From source file:org.sleuthkit.hadoop.scoring.CrossImageScorerJob.java
License:Open Source License
public static void runPipeline(String imgDir, String imgID, String friendlyName) {
    try {
        Path crossImageDir = new Path(imgDir + "/crossimg/data/");
        Path scoreDir = new Path(imgDir + "/crossimg/scores/");

        Job j = SKJobFactory.createJob(imgID, friendlyName, JobNames.CROSS_IMG_SIM_SCORING);

        j.setInputFormatClass(TableInputFormat.class);
        j.setOutputFormatClass(SequenceFileOutputFormat.class);
        j.setMapperClass(CrossImageScoreMapper.class);
        j.setReducerClass(CrossImageScoreReducer.class);

        j.setMapOutputKeyClass(BytesWritable.class);
        j.setMapOutputValueClass(BytesWritable.class);

        j.setOutputKeyClass(BytesWritable.class);
        j.setOutputValueClass(BytesArrayWritable.class);

        j.setJarByClass(CrossImageScoreMapper.class);
        SequenceFileOutputFormat.setOutputPath(j, crossImageDir);

        final Scan scan = new Scan();
        // This isn't good (who would ever want to use a regex to check string length?)
        // However, short of writing an entire input format, this is the best we can do.
        // It seems to improve performance by >50% with NSRL loaded in, so it's better
        // than nothing.
        scan.setFilter(new RowFilter(CompareOp.EQUAL, new RegexStringComparator(".{20,}")));

        HBaseConfiguration.addHbaseResources(j.getConfiguration());
        j.getConfiguration().set(TableInputFormat.INPUT_TABLE, "hash");
        j.getConfiguration().set(TableInputFormat.SCAN, convertScanToString(scan));
        j.getConfiguration().set(SKMapper.ID_KEY, imgID);

        j.waitForCompletion(true);

        // get the files in this image from the hadoop counter.
        long filesInImage = j.getCounters().findCounter(FileCount.FILES).getValue();

        j = SKJobFactory.createJob(imgID, friendlyName, JobNames.CROSS_IMG_SIM_SCORING_CALC);
        j.getConfiguration().setLong(IIFScoreReducer.FILES_IN_IMAGE, filesInImage);
        // TODO: Get the number of images from the images table. This is pretty key for IIF.
        j.getConfiguration().setLong(IIFScoreMapper.TOTAL_IMAGES, 11);
        j.setMapperClass(IIFScoreMapper.class);
        j.setReducerClass(IIFScoreReducer.class);
        j.setJarByClass(IIFScoreMapper.class);

        j.setInputFormatClass(SequenceFileInputFormat.class);
        j.setOutputFormatClass(TextOutputFormat.class);

        j.setMapOutputKeyClass(BytesWritable.class);
        j.setMapOutputValueClass(DoubleWritable.class);

        j.setOutputKeyClass(NullWritable.class);
        j.setOutputValueClass(Text.class);

        // Because we're building a json object, we need to have exactly one reducer.
        j.setNumReduceTasks(1);

        SequenceFileOutputFormat.setOutputPath(j, scoreDir);
        SequenceFileInputFormat.setInputPaths(j, crossImageDir);

        j.waitForCompletion(true);

        CrossImageJSONOutputBuilder.buildReport(new Path(scoreDir, "part-r-00000"),
                new Path(imgDir + "/reports/data/crossimg.js"));
    } catch (IOException ex) {
        LOG.error("Failure while performing HDFS file IO.", ex);
    } catch (ClassNotFoundException ex) {
        LOG.error("Error running job; class not found.", ex);
    } catch (InterruptedException ex) {
        LOG.error("Hadoop job interrupted.", ex);
    }
}
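This example also shows the common handoff pattern: tasks of the second job cannot call getCounters() on the first job, so the driver reads the FileCount.FILES counter and forwards it through the Configuration. A hedged sketch of the consuming side (the class, the key string, and the reducer body are illustrative, not taken from the CrossImageScorerJob source):

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Illustrative reducer for the second job: recover the counter value that the
// driver copied into the Configuration after the first job finished.
public static class IIFScoreReducerSketch extends Reducer<BytesWritable, DoubleWritable, NullWritable, Text> {
    public static final String FILES_IN_IMAGE = "iif.files.in.image"; // hypothetical key

    private long filesInImage;

    @Override
    protected void setup(Context context) {
        // Value originally produced by FileCount.FILES in job 1 and forwarded via setLong(...).
        filesInImage = context.getConfiguration().getLong(FILES_IN_IMAGE, 0L);
    }
}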