Example usage for org.apache.hadoop.mapreduce Job getCounters

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce.Job#getCounters().

Prototype

public Counters getCounters() throws IOException 

Document

Gets the counters for this job.
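
The following is a minimal sketch of the call pattern shared by the examples below; the CounterExample class and the "MyCounterGroup"/"MY_COUNTER" names are illustrative placeholders, not taken from any project on this page. getCounters() is called after the job has completed, and individual counters are then read with findCounter().

import java.io.IOException;

import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.TaskCounter;

public class CounterExample {

    // Reads counters from a job that is assumed to have been configured and
    // run elsewhere (e.g. via job.waitForCompletion(true)); counter values
    // are only final once the job has finished.
    public static void printCounters(Job job) throws IOException {
        Counters counters = job.getCounters();

        // Built-in task counter: total records read by all map tasks.
        Counter mapInput = counters.findCounter(TaskCounter.MAP_INPUT_RECORDS);
        System.out.println(mapInput.getDisplayName() + " = " + mapInput.getValue());

        // Counters can also be looked up by group and counter name, as several
        // examples below do; this group/name pair is purely illustrative.
        long custom = counters.findCounter("MyCounterGroup", "MY_COUNTER").getValue();
        System.out.println("MyCounterGroup.MY_COUNTER = " + custom);
    }
}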

Usage

From source file:ivory.preprocess.BuildTermDocVectors2.java

License:Apache License

@SuppressWarnings("unchecked")
public int runTool() throws Exception {
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    String collectionName = conf.get(Constants.CollectionName);
    String collectionPath = conf.get(Constants.CollectionPath);
    String inputFormat = conf.get(Constants.InputFormat);
    String tokenizer = conf.get(Constants.Tokenizer);
    String mappingClass = conf.get(Constants.DocnoMappingClass);
    int docnoOffset = conf.getInt(Constants.DocnoOffset, 0);

    LOG.info("PowerTool: BuildTermDocVectors2");
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.CollectionPath, collectionPath));
    LOG.info(String.format(" - %s: %s", Constants.InputFormat, inputFormat));
    LOG.info(String.format(" - %s: %s", Constants.Tokenizer, tokenizer));
    LOG.info(String.format(" - %s: %s", Constants.DocnoMappingClass, mappingClass));
    LOG.info(String.format(" - %s: %s", Constants.DocnoOffset, docnoOffset));

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    Path mappingFile = env.getDocnoMappingData();

    if (!fs.exists(mappingFile)) {
        LOG.error("Error, docno mapping data file " + mappingFile + "doesn't exist!");
        return 0;
    }

    DistributedCache.addCacheFile(mappingFile.toUri(), conf);

    Path outputPath = new Path(env.getTermDocVectorsDirectory());
    if (fs.exists(outputPath)) {
        LOG.info("TermDocVectors already exist: Skipping!");
        return 0;
    }

    env.writeCollectionName(collectionName);
    env.writeCollectionPath(collectionPath);
    env.writeInputFormat(inputFormat);
    env.writeDocnoMappingClass(mappingClass);
    env.writeTokenizerClass(tokenizer);
    env.writeDocnoOffset(docnoOffset);

    Job job1 = new Job(conf, "BuildTermDocVectors2:" + collectionName);
    job1.setJarByClass(BuildTermDocVectors2.class);

    job1.setNumReduceTasks(0);

    FileInputFormat.addInputPaths(job1, collectionPath);
    FileOutputFormat.setOutputPath(job1, outputPath);
    SequenceFileOutputFormat.setOutputCompressionType(job1, SequenceFile.CompressionType.RECORD);

    job1.setInputFormatClass((Class<? extends InputFormat>) Class.forName(inputFormat));
    job1.setOutputFormatClass(SequenceFileOutputFormat.class);

    job1.setMapOutputKeyClass(IntWritable.class);
    job1.setMapOutputValueClass(LazyTermDocVector.class);
    job1.setOutputKeyClass(IntWritable.class);
    job1.setOutputValueClass(LazyTermDocVector.class);

    job1.setMapperClass(MyMapper.class);

    long startTime = System.currentTimeMillis();
    job1.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // write out the number of documents in the collection
    int collectionDocCount = (int) job1.getCounters().findCounter(Docs.Total).getValue();
    env.writeCollectionDocumentCount(collectionDocCount);

    Path dlFile = env.getDoclengthsData();
    if (fs.exists(dlFile)) {
        LOG.info("DocLength data exists: Skipping!");
        return 0;
    }

    conf.setInt(Constants.CollectionDocumentCount, collectionDocCount);
    conf.set(InputPath, env.getDoclengthsDirectory().toString());
    conf.set(DocLengthDataFile, dlFile.toString());

    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);

    LOG.info("Writing doc length data to " + dlFile + "...");

    Job job2 = new Job(conf, "DocLengthTable2:" + collectionName);
    job2.setJarByClass(BuildTermDocVectors2.class);

    job2.setNumReduceTasks(0);
    job2.setInputFormatClass(NullInputFormat.class);
    job2.setOutputFormatClass(NullOutputFormat.class);
    job2.setMapperClass(DocLengthDataWriterMapper.class);

    startTime = System.currentTimeMillis();
    job2.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    long collectionSumOfDocLengths = job2.getCounters().findCounter(DocLengths.SumOfDocLengths).getValue();
    env.writeCollectionAverageDocumentLength((float) collectionSumOfDocLengths / collectionDocCount);

    return 0;
}

From source file:ivory.preprocess.GetTermCount2.java

License:Apache License

public int runTool() throws Exception {
    Configuration conf = getConf();

    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0);

    String collectionName = env.readCollectionName();
    String termDocVectorsPath = env.getTermDocVectorsDirectory();
    String termDfCfPath = env.getTermDfCfDirectory();

    if (!fs.exists(new Path(indexPath))) {
        LOG.info("index path doesn't existing: skipping!");
        return 0;
    }

    LOG.info("PowerTool: GetTermCount2");
    LOG.info(String.format(" - %s: %s", Constants.CollectionName, collectionName));
    LOG.info(String.format(" - %s: %s", Constants.IndexPath, indexPath));
    LOG.info(String.format(" - %s: %s", Constants.NumReduceTasks, reduceTasks));

    Path outputPath = new Path(termDfCfPath);
    if (fs.exists(outputPath)) {
        LOG.info("TermDfCf directory exist: skipping!");
        return 0;
    }

    Job job = new Job(getConf(), "GetTermCount2:" + collectionName);
    job.setJarByClass(GetTermCount2.class);

    job.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(job, new Path(termDocVectorsPath));
    FileOutputFormat.setOutputPath(job, outputPath);

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(PairOfIntLong.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(PairOfIntLong.class);

    job.setMapperClass(MyMapper.class);
    job.setCombinerClass(MyCombiner.class);
    job.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    job.waitForCompletion(true);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    Counters counters = job.getCounters();
    // Write out the term count. NOTE: this value is not the same as the
    // number of unique terms in the collection, because postings for
    // non-English terms are discarded, or dropped as a result of the df cut.
    env.writeCollectionTermCount((int) counters.findCounter(Statistics.Terms).getValue());

    env.writeCollectionLength(counters.findCounter(Statistics.SumOfDocLengths).getValue());
    return 0;
}

From source file:kogiri.common.report.Report.java

License:Open Source License

private String makeText(Job job) {
    String jobName = job.getJobName();
    String jobID = job.getJobID().toString();
    String jobStatus;
    try {
        jobStatus = job.getJobState().name();
    } catch (IOException ex) {
        jobStatus = "Unknown";
    } catch (InterruptedException ex) {
        jobStatus = "Unknown";
    }

    String startTimeStr;
    try {
        startTimeStr = TimeHelper.getTimeString(job.getStartTime());
    } catch (Exception ex) {
        startTimeStr = "Unknown";
    }

    String finishTimeStr;
    try {
        finishTimeStr = TimeHelper.getTimeString(job.getFinishTime());
    } catch (Exception ex) {
        finishTimeStr = "Unknown";
    }

    String timeTakenStr;
    try {
        timeTakenStr = TimeHelper.getDiffTimeString(job.getStartTime(), job.getFinishTime());
    } catch (Exception ex) {
        timeTakenStr = "Unknown";
    }

    String countersStr;
    try {
        countersStr = job.getCounters().toString();
    } catch (Exception ex) {
        countersStr = "Unknown";
    }

    return "Job : " + jobName + "\n" + "JobID : " + jobID + "\n" + "Status : " + jobStatus + "\n"
            + "StartTime : " + startTimeStr + "\n" + "FinishTime : " + finishTimeStr + "\n" + "TimeTaken : "
            + timeTakenStr + "\n\n" + countersStr;
}

From source file:kogiri.mapreduce.preprocess.indexing.stage3.KmerStatisticsBuilder.java

License:Open Source License

private int runJob(PreprocessorConfig ppConfig) throws Exception {
    // check config
    validatePreprocessorConfig(ppConfig);

    // configuration
    Configuration conf = this.getConf();

    // set user configuration
    ppConfig.getClusterConfiguration().configureTo(conf);
    ppConfig.saveTo(conf);

    Path[] inputFiles = KmerIndexHelper.getAllKmerIndexIndexFilePath(conf, ppConfig.getKmerIndexPath());

    for (Path inputFile : inputFiles) {
        LOG.info(inputFile);
    }

    boolean job_result = true;
    List<Job> jobs = new ArrayList<Job>();

    for (int round = 0; round < inputFiles.length; round++) {
        Path roundInputFile = inputFiles[round];
        Path[] roundInputKmerIndexPartFiles = KmerIndexHelper.getKmerIndexPartFilePath(conf, roundInputFile);

        Job job = new Job(conf,
                "Kogiri Preprocessor - Computing Kmer Statistics (" + round + " of " + inputFiles.length + ")");
        job.setJarByClass(KmerStatisticsBuilder.class);

        // Mapper
        job.setMapperClass(KmerStatisticsBuilderMapper.class);
        job.setInputFormatClass(SequenceFileInputFormat.class);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(NullWritable.class);

        // Specify key / value
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(NullWritable.class);

        // Inputs
        Path[] kmerIndexPartDataFiles = KmerIndexHelper.getAllKmerIndexPartDataFilePath(conf,
                roundInputKmerIndexPartFiles);
        SequenceFileInputFormat.addInputPaths(job, FileSystemHelper.makeCommaSeparated(kmerIndexPartDataFiles));

        LOG.info("Input file : ");
        LOG.info("> " + roundInputFile.toString());

        // Outputs
        job.setOutputFormatClass(NullOutputFormat.class);

        job.setNumReduceTasks(0);

        // Execute job and return status
        boolean result = job.waitForCompletion(true);

        jobs.add(job);

        // check results
        if (result) {
            CounterGroup uniqueGroup = job.getCounters()
                    .getGroup(KmerStatisticsHelper.getCounterGroupNameUnique());
            CounterGroup totalGroup = job.getCounters()
                    .getGroup(KmerStatisticsHelper.getCounterGroupNameTotal());
            CounterGroup squareGroup = job.getCounters()
                    .getGroup(KmerStatisticsHelper.getCounterGroupNameSquare());
            CounterGroup logTFSquareGroup = job.getCounters()
                    .getGroup(KmerStatisticsHelper.getCounterGroupNameLogTFSquare());

            Iterator<Counter> uniqueIterator = uniqueGroup.iterator();
            while (uniqueIterator.hasNext()) {
                long count = 0;
                long length = 0;
                long square = 0;
                double logTFSquare = 0;
                double real_mean = 0;
                double stddev = 0;
                double tf_cosnorm_base = 0;

                Counter uniqueCounter = uniqueIterator.next();
                Counter totalCounter = totalGroup.findCounter(uniqueCounter.getName());
                Counter squareCounter = squareGroup.findCounter(uniqueCounter.getName());
                Counter logTFSquareCounter = logTFSquareGroup.findCounter(uniqueCounter.getName());

                count = uniqueCounter.getValue();
                length = totalCounter.getValue();
                square = squareCounter.getValue();
                logTFSquare = logTFSquareCounter.getValue() / 1000.0;

                tf_cosnorm_base = Math.sqrt(logTFSquare);

                real_mean = (double) length / (double) count;
                // stddev = sqrt(sum(lengths^2) / count - mean^2)
                double mean = Math.pow(real_mean, 2);
                double term = (double) square / (double) count;
                stddev = Math.sqrt(term - mean);

                LOG.info("distinct k-mers " + uniqueCounter.getName() + " : " + count);
                LOG.info("total k-mers " + uniqueCounter.getName() + " : " + length);
                LOG.info("average " + uniqueCounter.getName() + " : " + real_mean);
                LOG.info("std-deviation " + uniqueCounter.getName() + " : " + stddev);
                LOG.info("tf-cos-norm-base " + uniqueCounter.getName() + " : " + tf_cosnorm_base);

                Path outputHadoopPath = new Path(ppConfig.getKmerStatisticsPath(),
                        KmerStatisticsHelper.makeKmerStatisticsFileName(uniqueCounter.getName()));
                FileSystem fs = outputHadoopPath.getFileSystem(conf);

                KmerStatistics statistics = new KmerStatistics();
                statistics.setSampleName(uniqueCounter.getName());
                statistics.setKmerSize(ppConfig.getKmerSize());
                statistics.setUniqueKmers(count);
                statistics.setTotalKmers(length);
                statistics.setAverageFrequency(real_mean);
                statistics.setStdDeviation(stddev);
                statistics.setTFCosineNormBase(tf_cosnorm_base);

                statistics.saveTo(fs, outputHadoopPath);
            }
        }

        if (!result) {
            LOG.error("job failed at round " + round + " of " + inputFiles.length);
            job_result = false;
            break;
        }
    }

    // report
    if (ppConfig.getReportPath() != null && !ppConfig.getReportPath().isEmpty()) {
        Report report = new Report();
        report.addJob(jobs);
        report.writeTo(ppConfig.getReportPath());
    }

    return job_result ? 0 : 1;
}

From source file:libra.preprocess.stage2.KmerIndexBuilder.java

License:Apache License

private int runJob(PreprocessorConfig ppConfig) throws Exception {
    // check config
    validatePreprocessorConfig(ppConfig);

    // configuration
    Configuration conf = this.getConf();

    // set user configuration
    ppConfig.saveTo(conf);

    Path[] inputFiles = FileSystemHelper.getAllFastaFilePath(conf, ppConfig.getFastaPath());

    boolean job_result = true;
    List<Job> jobs = new ArrayList<Job>();

    for (int round = 0; round < inputFiles.length; round++) {
        Path roundInputFile = inputFiles[round];
        String roundOutputPath = ppConfig.getKmerIndexPath() + "_round" + round;

        Job job = new Job(conf,
                "Libra Preprocessor - Building Kmer Indexes (" + round + " of " + inputFiles.length + ")");
        job.setJarByClass(KmerIndexBuilder.class);

        // Mapper
        job.setMapperClass(KmerIndexBuilderMapper.class);
        FastaKmerInputFormat.setKmerSize(conf, ppConfig.getKmerSize());
        job.setInputFormatClass(FastaKmerInputFormat.class);
        job.setMapOutputKeyClass(CompressedSequenceWritable.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Combiner
        job.setCombinerClass(KmerIndexBuilderCombiner.class);

        // Partitioner
        job.setPartitionerClass(KmerIndexBuilderPartitioner.class);

        // Reducer
        job.setReducerClass(KmerIndexBuilderReducer.class);

        // Specify key / value
        job.setOutputKeyClass(CompressedSequenceWritable.class);
        job.setOutputValueClass(IntWritable.class);

        // Inputs
        FileInputFormat.addInputPaths(job, roundInputFile.toString());

        LOG.info("Input file : ");
        LOG.info("> " + roundInputFile.toString());

        String histogramFileName = KmerHistogramHelper.makeKmerHistogramFileName(roundInputFile.getName());
        Path histogramPath = new Path(ppConfig.getKmerHistogramPath(), histogramFileName);

        KmerIndexBuilderPartitioner.setHistogramPath(job.getConfiguration(), histogramPath);

        FileOutputFormat.setOutputPath(job, new Path(roundOutputPath));
        job.setOutputFormatClass(MapFileOutputFormat.class);

        // Use many reducers
        int reducers = conf.getInt("mapred.reduce.tasks", 0);
        if (reducers <= 0) {
            int MRNodes = MapReduceClusterHelper.getNodeNum(conf);
            reducers = MRNodes * 2;
            job.setNumReduceTasks(reducers);
        }
        LOG.info("Reducers : " + reducers);

        // Execute job and return status
        boolean result = job.waitForCompletion(true);

        jobs.add(job);

        // commit results
        if (result) {
            commitRoundIndexOutputFiles(roundInputFile, new Path(roundOutputPath),
                    new Path(ppConfig.getKmerIndexPath()), job.getConfiguration(), ppConfig.getKmerSize());

            // create index of index
            createIndexOfIndex(new Path(ppConfig.getKmerIndexPath()), roundInputFile, job.getConfiguration(),
                    ppConfig.getKmerSize());

            // create statistics of index
            createStatisticsOfIndex(new Path(ppConfig.getKmerStatisticsPath()), roundInputFile,
                    job.getConfiguration(), job.getCounters(), ppConfig.getKmerSize());
        }

        if (!result) {
            LOG.error("job failed at round " + round + " of " + inputFiles.length);
            job_result = false;
            break;
        }
    }

    // report
    if (ppConfig.getReportPath() != null && !ppConfig.getReportPath().isEmpty()) {
        Report report = new Report();
        report.addJob(jobs);
        report.writeTo(ppConfig.getReportPath());
    }

    return job_result ? 0 : 1;
}

From source file:ml.shifu.shifu.core.processor.stats.MapReducerStatsWorker.java

License:Apache License

protected void updateBinningInfoWithMRJob() throws IOException, InterruptedException, ClassNotFoundException {
    RawSourceData.SourceType source = this.modelConfig.getDataSet().getSource();

    String filePath = Constants.BINNING_INFO_FILE_NAME;
    BufferedWriter writer = null;
    List<Scanner> scanners = null;
    try {
        scanners = ShifuFileUtils.getDataScanners(pathFinder.getUpdatedBinningInfoPath(source), source);
        writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(new File(filePath)), Charset.forName("UTF-8")));
        for (Scanner scanner : scanners) {
            while (scanner.hasNextLine()) {
                String line = scanner.nextLine();
                writer.write(line + "\n");
            }
        }
    } finally {
        // release
        processor.closeScanners(scanners);
        IOUtils.closeQuietly(writer);
    }

    Configuration conf = new Configuration();
    prepareJobConf(source, conf, filePath);

    @SuppressWarnings("deprecation")
    Job job = new Job(conf, "Shifu: Stats Updating Binning Job : " + this.modelConfig.getModelSetName());
    job.setJarByClass(getClass());
    job.setMapperClass(UpdateBinningInfoMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(BinningInfoWritable.class);
    job.setInputFormatClass(CombineInputFormat.class);
    FileInputFormat.setInputPaths(job, ShifuFileUtils.getFileSystemBySourceType(source)
            .makeQualified(new Path(super.modelConfig.getDataSetRawPath())));

    job.setReducerClass(UpdateBinningInfoReducer.class);

    int mapperSize = new CombineInputFormat().getSplits(job).size();
    log.info("DEBUG: Test mapper size is {} ", mapperSize);
    Integer reducerSize = Environment.getInt(CommonConstants.SHIFU_UPDATEBINNING_REDUCER);
    if (reducerSize != null) {
        job.setNumReduceTasks(Environment.getInt(CommonConstants.SHIFU_UPDATEBINNING_REDUCER, 20));
    } else {
        // By average, each reducer handle 100 variables
        int newReducerSize = (this.columnConfigList.size() / 100) + 1;
        // if(newReducerSize < 1) {
        // newReducerSize = 1;
        // }
        // if(newReducerSize > 500) {
        // newReducerSize = 500;
        // }
        log.info("Adjust updating binning info reducer size to {} ", newReducerSize);
        job.setNumReduceTasks(newReducerSize);
    }
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    String preTrainingInfo = this.pathFinder.getPreTrainingStatsPath(source);
    FileOutputFormat.setOutputPath(job, new Path(preTrainingInfo));

    // clean output firstly
    ShifuFileUtils.deleteFile(preTrainingInfo, source);

    // submit job
    if (!job.waitForCompletion(true)) {
        FileUtils.deleteQuietly(new File(filePath));
        throw new RuntimeException("MapReduce Job Updateing Binning Info failed.");
    } else {
        long totalValidCount = job.getCounters().findCounter(Constants.SHIFU_GROUP_COUNTER, "TOTAL_VALID_COUNT")
                .getValue();
        long invalidTagCount = job.getCounters().findCounter(Constants.SHIFU_GROUP_COUNTER, "INVALID_TAG")
                .getValue();
        long filterOut = job.getCounters().findCounter(Constants.SHIFU_GROUP_COUNTER, "FILTER_OUT_COUNT")
                .getValue();
        long weightExceptions = job.getCounters().findCounter(Constants.SHIFU_GROUP_COUNTER, "WEIGHT_EXCEPTION")
                .getValue();
        log.info(
                "Total valid records {}, invalid tag records {}, filter out records {}, weight exception records {}",
                totalValidCount, invalidTagCount, filterOut, weightExceptions);

        if (totalValidCount > 0L && invalidTagCount * 1d / totalValidCount >= 0.8d) {
            log.warn(
                    "Too many invalid tags, please check you configuration on positive tags and negative tags.");
        }
    }
    FileUtils.deleteQuietly(new File(filePath));
}

From source file:mvm.rya.accumulo.mr.fileinput.RdfFileInputByLineTool.java

License:Apache License

public long runJob(String[] args)
        throws IOException, ClassNotFoundException, InterruptedException, AccumuloSecurityException {
    conf.setBoolean("mapred.map.tasks.speculative.execution", false);
    conf.setBoolean("mapred.reduce.tasks.speculative.execution", false);
    conf.set("io.sort.mb", "256");
    conf.setLong("mapred.task.timeout", 600000000);

    zk = conf.get(MRUtils.AC_ZK_PROP, zk);
    instance = conf.get(MRUtils.AC_INSTANCE_PROP, instance);
    userName = conf.get(MRUtils.AC_USERNAME_PROP, userName);
    pwd = conf.get(MRUtils.AC_PWD_PROP, pwd);
    format = RDFFormat.valueOf(conf.get(MRUtils.FORMAT_PROP, RDFFormat.NTRIPLES.toString()));

    String tablePrefix = conf.get(MRUtils.TABLE_PREFIX_PROPERTY, RdfCloudTripleStoreConstants.TBL_PRFX_DEF);

    Job job = new Job(conf);
    job.setJarByClass(RdfFileInputByLineTool.class);

    // set up cloudbase input
    job.setInputFormatClass(TextInputFormat.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));

    // set input output of the particular job
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Mutation.class);

    job.setOutputFormatClass(AccumuloOutputFormat.class);
    AccumuloOutputFormat.setConnectorInfo(job, userName, new PasswordToken(pwd.getBytes()));
    AccumuloOutputFormat.setCreateTables(job, true);
    AccumuloOutputFormat.setDefaultTableName(job, tablePrefix + RdfCloudTripleStoreConstants.TBL_SPO_SUFFIX);
    AccumuloOutputFormat.setZooKeeperInstance(job, instance, zk);

    // set mapper and reducer classes
    job.setMapperClass(TextToMutationMapper.class);
    job.setNumReduceTasks(0);

    // Submit the job
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    int exitCode = job.waitForCompletion(true) ? 0 : 1;

    if (exitCode == 0) {
        Date end_time = new Date();
        System.out.println("Job ended: " + end_time);
        System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
        return job.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", "REDUCE_OUTPUT_RECORDS")
                .getValue();
    } else {
        System.out.println("Job Failed!!!");
    }

    return -1;
}

From source file:mvm.rya.accumulo.mr.fileinput.RdfFileInputTool.java

License:Apache License

public long runJob(String[] args)
        throws IOException, ClassNotFoundException, InterruptedException, AccumuloSecurityException {
    conf.set(MRUtils.JOB_NAME_PROP, "Rdf File Input");
    // faster
    init();
    format = conf.get(MRUtils.FORMAT_PROP, format);
    conf.set(MRUtils.FORMAT_PROP, format);

    String inputPath = conf.get(MRUtils.INPUT_PATH, args[0]);

    Job job = new Job(conf);
    job.setJarByClass(RdfFileInputTool.class);

    // set up cloudbase input
    job.setInputFormatClass(RdfFileInputFormat.class);
    RdfFileInputFormat.addInputPath(job, new Path(inputPath));

    // set input output of the particular job
    job.setMapOutputKeyClass(LongWritable.class);
    job.setMapOutputValueClass(RyaStatementWritable.class);

    setupOutputFormat(job, tablePrefix + RdfCloudTripleStoreConstants.TBL_SPO_SUFFIX);

    // set mapper and reducer classes
    job.setMapperClass(StatementToMutationMapper.class);
    job.setNumReduceTasks(0);

    // Submit the job
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    int exitCode = job.waitForCompletion(true) ? 0 : 1;

    if (exitCode == 0) {
        Date end_time = new Date();
        System.out.println("Job ended: " + end_time);
        System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
        return job.getCounters().findCounter("org.apache.hadoop.mapred.Task$Counter", "REDUCE_OUTPUT_RECORDS")
                .getValue();
    } else {
        System.out.println("Job Failed!!!");
    }

    return -1;
}

From source file:mvm.rya.accumulo.pig.IndexWritingTool.java

License:Apache License

@Override
public int run(final String[] args) throws Exception {
    Preconditions.checkArgument(args.length == 7, "java " + IndexWritingTool.class.getCanonicalName()
            + " hdfsSaveLocation sparqlFile cbinstance cbzk cbuser cbpassword rdfTablePrefix.");

    final String inputDir = args[0];
    final String sparqlFile = args[1];
    final String instStr = args[2];
    final String zooStr = args[3];
    final String userStr = args[4];
    final String passStr = args[5];
    final String tablePrefix = args[6];

    String sparql = FileUtils.readFileToString(new File(sparqlFile));

    Job job = new Job(getConf(), "Write HDFS Index to Accumulo");
    job.setJarByClass(this.getClass());

    Configuration jobConf = job.getConfiguration();
    jobConf.setBoolean("mapred.map.tasks.speculative.execution", false);
    setVarOrders(sparql, jobConf);

    TextInputFormat.setInputPaths(job, inputDir);
    job.setInputFormatClass(TextInputFormat.class);

    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Mutation.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Mutation.class);

    job.setNumReduceTasks(0);

    String tableName;
    if (zooStr.equals("mock")) {
        tableName = tablePrefix;
    } else {
        tableName = tablePrefix + "INDEX_" + UUID.randomUUID().toString().replace("-", "").toUpperCase();
    }
    setAccumuloOutput(instStr, zooStr, userStr, passStr, job, tableName);

    jobConf.set(sparql_key, sparql);

    int complete = job.waitForCompletion(true) ? 0 : -1;

    if (complete == 0) {

        String[] varOrders = jobConf.getStrings("varOrders");
        String orders = Joiner.on("\u0000").join(varOrders);
        Instance inst;

        if (zooStr.equals("mock")) {
            inst = new MockInstance(instStr);
        } else {
            inst = new ZooKeeperInstance(instStr, zooStr);
        }

        Connector conn = inst.getConnector(userStr, passStr.getBytes());
        BatchWriter bw = conn.createBatchWriter(tableName, 10, 5000, 1);

        Counters counters = job.getCounters();
        Counter c1 = counters.findCounter(cardCounter, cardCounter);

        Mutation m = new Mutation("~SPARQL");
        Value v = new Value(sparql.getBytes());
        m.put(new Text("" + c1.getValue()), new Text(orders), v);
        bw.addMutation(m);

        bw.close();

        return complete;
    } else {
        return complete;
    }

}

From source file:net.broomie.JpWordCounter.java

License:Apache License

/**
 * This method creates the dfdb with MapReduce.
 * The dfdb means `document frequency'.
 * @param conf Specify the conf object, which is hadoop Configuration.
 * @param dfdb Specify the dfdb directory path on HDFS.
 * @return Return `true' if success, return `false' if fail.
 * @throws IOException Exception for an input file IO.
 * @throws InterruptedException Exception for return waitForCompletion().
 * @throws ClassNotFoundException Exception for Mapper and Reduce class.
 * @throws URISyntaxException Exception for new URI().
 */
private boolean runCreateDFDB(Configuration conf, String dfdb)
        throws IOException, InterruptedException, ClassNotFoundException, URISyntaxException {
    String reducerNum = conf.get(WORD_COUNTER_REDUCER_NUM);
    Job job = new Job(conf);
    job.setJarByClass(JpWordCounter.class);
    TextInputFormat.addInputPath(job, new Path(in));
    FileSystem fs = FileSystem.get(new URI(dfdb), conf);
    FileStatus[] status = fs.listStatus(new Path(dfdb));
    if (status != null) {
        fs.delete(new Path(dfdb), true);
    }
    fs.close();
    FileOutputFormat.setOutputPath(job, new Path(dfdb));
    job.setMapperClass(DFMapper.class);
    job.setReducerClass(TokenizeReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setNumReduceTasks(Integer.valueOf(reducerNum));
    boolean rv = job.waitForCompletion(true);

    if (rv) {
        Counters counters = job.getCounters();
        long inputNum = counters.findCounter("org.apache.hadoop.mapred.Task$Counter", "MAP_INPUT_RECORDS")
                .getValue();
        FileSystem hdfs = FileSystem.get(conf);
        String numLinePath = conf.get(PROP_LINE_NUM);
        FSDataOutputStream stream = hdfs.create(new Path(numLinePath));
        stream.writeUTF(String.valueOf((int) inputNum));
        stream.close();
    }

    return rv;
}