Example usage for org.apache.hadoop.mapreduce Job addCacheFile

List of usage examples for org.apache.hadoop.mapreduce Job addCacheFile

Introduction

On this page you can find example usage for org.apache.hadoop.mapreduce Job addCacheFile.

Prototype

public void addCacheFile(URI uri) 

Document

Add a file to be localized.
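Files registered through addCacheFile are copied to the nodes that run the job's tasks and, on YARN, are linked into each task's working directory under their base name (or under the URI fragment, if one is given). A task typically loads them once in setup(). The following is a minimal sketch of that pattern, not taken from any of the projects below; the path, class names, and stopword logic are illustrative only.

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CacheFileSketch {

    // Mapper that loads the cached stopword list once, in setup().
    public static class StopwordMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private final Set<String> stopwords = new HashSet<>();

        @Override
        protected void setup(Context context) throws IOException {
            URI[] cacheFiles = context.getCacheFiles();
            if (cacheFiles == null) {
                return;
            }
            for (URI uri : cacheFiles) {
                // The localized copy is linked into the task working directory
                // under its base name, so it can be opened as a local file.
                String localName = new Path(uri.getPath()).getName();
                try (BufferedReader reader = new BufferedReader(new FileReader(localName))) {
                    String line;
                    while ((line = reader.readLine()) != null) {
                        stopwords.add(line.trim().toLowerCase());
                    }
                }
            }
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            for (String token : value.toString().toLowerCase().split("\\s+")) {
                if (!token.isEmpty() && !stopwords.contains(token)) {
                    context.write(new Text(token), new IntWritable(1));
                }
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "cache file sketch");
        job.setJarByClass(CacheFileSketch.class);

        // Register the file; it must already be reachable by the cluster
        // (for example on HDFS) when the job is submitted.
        job.addCacheFile(new Path("/cache/stopwords.txt").toUri());

        job.setMapperClass(StopwordMapper.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}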

Usage

From source file:eu.edisonproject.classification.tfidf.mapreduce.WordFrequencyInDocDriver.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    //        itemset = new LinkedList<String>();
    //        BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(args[2])));
    //        String line;
    //        while ((line = br.readLine()) != null) {
    //            String[] components = line.split("/");
    //            itemset.add(components[0]);
    //        }
    Configuration conf = getConf();

    Job job = Job.getInstance(conf);
    job.setJarByClass(WordFrequencyInDocDriver.class);
    job.setJobName("Word Frequency In Doc Driver");

    FileSystem fs = FileSystem.get(conf);
    fs.delete(new Path(args[1]), true);
    Path in = new Path(args[0]);
    Path inHdfs = in;

    Path dictionaryLocal = new Path(args[2]);
    Path dictionaryHDFS = dictionaryLocal;

    Path stopwordsLocal = new Path(args[3]);
    Path stopwordsHDFS = stopwordsLocal;

    if (!conf.get(FileSystem.FS_DEFAULT_NAME_KEY).startsWith("file")) {
        inHdfs = new Path(in.getName());
        fs.delete(inHdfs, true);
        fs.copyFromLocalFile(in, inHdfs);
        fs.deleteOnExit(inHdfs);

        dictionaryHDFS = new Path(dictionaryLocal.getName());
        if (!fs.exists(dictionaryHDFS)) {
            fs.copyFromLocalFile(dictionaryLocal, dictionaryHDFS);
        }
        stopwordsHDFS = new Path(stopwordsLocal.getName());
        if (!fs.exists(stopwordsHDFS)) {
            fs.copyFromLocalFile(stopwordsLocal, stopwordsHDFS);
        }
    }

    FileStatus dictionaryStatus = fs.getFileStatus(dictionaryHDFS);
    dictionaryHDFS = dictionaryStatus.getPath();
    job.addCacheFile(dictionaryHDFS.toUri());

    FileStatus stopwordsStatus = fs.getFileStatus(stopwordsHDFS);
    stopwordsHDFS = stopwordsStatus.getPath();
    job.addCacheFile(stopwordsHDFS.toUri());

    FileInputFormat.setInputPaths(job, inHdfs);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormatClass(AvroKeyInputFormat.class);
    job.setMapperClass(WordFrequencyInDocMapper.class);
    AvroJob.setInputKeySchema(job, Document.getClassSchema());
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Integer.class);
    job.setReducerClass(WordFrequencyInDocReducer.class);
    return (job.waitForCompletion(true) ? 0 : 1);
}
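This driver caches two files (the dictionary and the stopword list), so the task side has to tell them apart. A hedged sketch of how a mapper could do that in setup(); the class below is illustrative and is not the project's WordFrequencyInDocMapper.

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Illustrative skeleton: distinguishes the two cached files by base name.
public class TwoCacheFilesMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private final Set<String> stopwords = new HashSet<>();
    private final Set<String> dictionary = new HashSet<>();

    @Override
    protected void setup(Context context) throws IOException {
        URI[] cacheFiles = context.getCacheFiles();
        if (cacheFiles == null) {
            return;
        }
        for (URI uri : cacheFiles) {
            // Each cached file is linked into the task working directory
            // under its base name.
            String name = new Path(uri.getPath()).getName();
            Set<String> target = name.toLowerCase().contains("stopword") ? stopwords : dictionary;
            try (BufferedReader reader = new BufferedReader(new FileReader(name))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    target.add(line.trim().toLowerCase());
                }
            }
        }
    }
}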

From source file:eu.edisonproject.training.tfidf.mapreduce.TermWordFrequency.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration jobconf = getConf();

    FileSystem fs = FileSystem.get(jobconf);
    fs.delete(new Path(args[1]), true);
    Path in = new Path(args[0]);
    Path inHdfs = in;
    if (!jobconf.get(FileSystem.FS_DEFAULT_NAME_KEY).startsWith("file")) {
        inHdfs = new Path(in.getName());
        fs.delete(inHdfs, true);
        fs.copyFromLocalFile(in, inHdfs);
        fs.deleteOnExit(inHdfs);
        FileStatus inHdfsStatus = fs.getFileStatus(inHdfs);
        //            Logger.getLogger(TermWordFrequency.class.getName()).log(Level.INFO, "Copied: {0} to: {1}", new Object[]{in.toUri(), inHdfsStatus.getPath().toUri()});
    }

    Job job = Job.getInstance(jobconf);
    Path stopwordsLocal = new Path(args[3]);
    stopwords = new Path(stopwordsLocal.getName());
    fs.delete(stopwords, true);
    fs.copyFromLocalFile(stopwordsLocal, stopwords);
    fs.deleteOnExit(stopwords);

    FileStatus stopwordsStatus = fs.getFileStatus(stopwords);
    stopwords = stopwordsStatus.getPath();
    job.addCacheFile(stopwords.toUri());

    Path localDocs = new Path(args[2]);
    Path hdfsDocs = new Path(localDocs.getName());
    fs.mkdirs(hdfsDocs);
    hdfsDocs = fs.getFileStatus(hdfsDocs).getPath();
    fs.delete(hdfsDocs, true);
    //        FileStatus[] stats = fs.listStatus(localDocs);
    File[] stats = new File(localDocs.toString()).listFiles();

    for (File stat : stats) {
        //        for (FileStatus stat : stats) {
        Path filePath = new Path(stat.getAbsolutePath());
        if (FilenameUtils.getExtension(filePath.getName()).endsWith("txt")) {
            Path dest = new Path(hdfsDocs.toUri() + "/" + filePath.getName());
            fs.copyFromLocalFile(filePath, dest);
        }
    }

    job.addCacheFile(hdfsDocs.toUri());

    job.setJarByClass(TermWordFrequency.class);
    job.setJobName("Word Frequency Term Driver");

    FileInputFormat.setInputPaths(job, inHdfs);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    //         job.setInputFormatClass(TextInputFormat.class);
    job.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.addInputPath(job, inHdfs);
    NLineInputFormat.setNumLinesPerSplit(job, Integer.valueOf(args[4]));
    NLineInputFormat.setMaxInputSplitSize(job, 500);
    Logger.getLogger(TermWordFrequency.class.getName()).log(Level.INFO, "Num. of lines: {0}",
            NLineInputFormat.getNumLinesPerSplit(job));

    job.setMapperClass(TermWordFrequencyMapper.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Integer.class);
    job.setReducerClass(TermWordFrequencyReducer.class);

    return (job.waitForCompletion(true) ? 0 : 1);

}

From source file:fr.ens.biologie.genomique.eoulsan.modules.expression.hadoop.ExpressionHadoopModule.java

License:LGPL

/**
   * Create JobConf object for HTSeq-count.
   * @param context the task context
   * @param alignmentsData alignment data
   * @param featureAnnotationData feature annotations data
   * @param gtfFormat true if the annotation file is in GTF format
   * @param genomeDescriptionData genome description data
   * @param genomicType genomic type
   * @param attributeId attributeId
   * @param splitAttributeValues split attribute values
   * @param stranded stranded mode
   * @param overlapMode overlap mode
   * @param removeAmbiguousCases true to remove ambiguous cases
   * @throws IOException if an error occurs while creating job
   * @throws BadBioEntryException if an entry of the annotation file is invalid
   * @throws EoulsanException if the job creating fails
   */
  private static Job createJobHTSeqCounter(final Configuration parentConf, final TaskContext context,
          final Data alignmentsData, final Data featureAnnotationData, final boolean gtfFormat,
          final Data genomeDescriptionData, final Data outData, final String genomicType,
          final String attributeId, final boolean splitAttributeValues, final StrandUsage stranded,
          final OverlapMode overlapMode, final boolean removeAmbiguousCases, final boolean tsamFormat)
          throws IOException, BadBioEntryException, EoulsanException {

      final Configuration jobConf = new Configuration(parentConf);

      // Get input DataFile
      DataFile inputDataFile = alignmentsData.getDataFile();

      if (inputDataFile == null) {
          throw new IOException("No input file found.");
      }

      final String dataFileSource;

      if (tsamFormat) {
          dataFileSource = StringUtils.filenameWithoutExtension(inputDataFile.getSource()) + TSAM_EXTENSION;
      } else {
          dataFileSource = inputDataFile.getSource();
      }

      // Set input path
      final Path inputPath = new Path(dataFileSource);

      // Get annotation DataFile
      final DataFile annotationDataFile = featureAnnotationData.getDataFile();

      // Get output file
      final DataFile outFile = outData.getDataFile();

      // Get temporary file
      final DataFile tmpFile = new DataFile(outFile.getParent(), outFile.getBasename() + ".tmp");

      getLogger().fine("sample: " + alignmentsData.getName());
      getLogger().fine("inputPath.getName(): " + inputPath.getName());
      getLogger().fine("annotationDataFile: " + annotationDataFile.getSource());
      getLogger().fine("outFile: " + outFile.getSource());
      getLogger().fine("tmpFile: " + tmpFile.getSource());

      jobConf.set("mapred.child.java.opts", "-Xmx1024m");

      // Set counter group
      jobConf.set(CommonHadoop.COUNTER_GROUP_KEY, COUNTER_GROUP);

      // Set Genome description path
      final DataFile genomeDescDataFile = genomeDescriptionData.getDataFile();
      jobConf.set(GENOME_DESC_PATH_KEY, genomeDescDataFile.getSource());

      // Set the "stranded" parameter
      jobConf.set(HTSeqCountMapper.STRANDED_PARAM, stranded.getName());

      // Set the "overlap mode" parameter
      jobConf.set(HTSeqCountMapper.OVERLAP_MODE_PARAM, overlapMode.getName());

      // Set the "remove ambiguous cases" parameter
      jobConf.setBoolean(HTSeqCountMapper.REMOVE_AMBIGUOUS_CASES, removeAmbiguousCases);

      final Path featuresIndexPath = getAnnotationIndexSerializedPath(featureAnnotationData.getDataFile());

      getLogger().info("featuresIndexPath: " + featuresIndexPath);

      // Create serialized feature index
      if (!PathUtils.isFile(featuresIndexPath, jobConf)) {

          final Locker lock = createZookeeperLock(parentConf, context);

          lock.lock();

          createFeaturesIndex(context, annotationDataFile, gtfFormat, genomicType, attributeId,
                  splitAttributeValues, stranded, genomeDescDataFile, featuresIndexPath, jobConf);

          lock.unlock();
      }

      // Create the job and its name
      final Job job = Job.getInstance(jobConf,
              "Expression computation with htseq-count (" + alignmentsData.getName() + ", " + inputPath.getName()
                      + ", " + annotationDataFile.getSource() + ", " + genomicType + ", " + attributeId
                      + ", stranded: " + stranded + ", removeAmbiguousCases: " + removeAmbiguousCases + ")");

      // Set the path to the features index
      job.addCacheFile(featuresIndexPath.toUri());

      // Set the jar
      job.setJarByClass(ExpressionHadoopModule.class);

      // Set input path
      FileInputFormat.setInputPaths(job, inputPath);

      // Set input format
      job.setInputFormatClass(SAMInputFormat.class);

      // Set the mapper class
      job.setMapperClass(HTSeqCountMapper.class);

      // Set the combiner class
      job.setCombinerClass(HTSeqCountReducer.class);

      // Set the reducer class
      job.setReducerClass(HTSeqCountReducer.class);

      // Set the output format
      job.setOutputFormatClass(ExpressionOutputFormat.class);

      // Set the output key class
      job.setOutputKeyClass(Text.class);

      // Set the output value class
      job.setOutputValueClass(LongWritable.class);

      // Set output path
      FileOutputFormat.setOutputPath(job, new Path(tmpFile.getSource()));

      return job;
  }

From source file:fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.FilterAndMapReadsHadoopModule.java

License:LGPL

private Job createJobConf(final Configuration parentConf, final TaskContext context, final String dataName,
        final DataFile inFile, final List<String> filenames, final boolean pairedEnd,
        final DataFormat inputFormat, final FastqFormat fastqFormat, final DataFile genomeIndexFile,
        final DataFile outFile) throws IOException {

    final Configuration jobConf = new Configuration(parentConf);

    // Set input path
    final Path inputPath = new Path(inFile.getSource());

    // Set counter group
    jobConf.set(CommonHadoop.COUNTER_GROUP_KEY, getCounterGroup());

    //
    // Reads filters parameters
    //

    // Set fastq format
    jobConf.set(ReadsFilterMapper.FASTQ_FORMAT_KEY, fastqFormat.getName());

    // Set read filter parameters
    addParametersToJobConf(getReadFilterParameters(), READ_FILTER_PARAMETER_KEY_PREFIX, jobConf);

    //
    // Reads mapping parameters
    //

    // Set mapper name
    jobConf.set(ReadsMapperMapper.MAPPER_NAME_KEY, getMapperName());

    // Set mapper version
    jobConf.set(ReadsMapperMapper.MAPPER_VERSION_KEY, getMapperVersion());

    // Set mapper flavor
    jobConf.set(ReadsMapperMapper.MAPPER_FLAVOR_KEY, getMapperFlavor());

    // Set pair end or single end mode
    jobConf.set(ReadsMapperMapper.PAIR_END_KEY, Boolean.toString(pairedEnd));

    // Set the number of threads for the mapper
    if (getMapperHadoopThreads() < 0) {
        jobConf.set(ReadsMapperMapper.MAPPER_THREADS_KEY, "" + getMapperHadoopThreads());
    }

    // Set mapper arguments
    if (getMapperArguments() != null) {
        jobConf.set(ReadsMapperMapper.MAPPER_ARGS_KEY, getMapperArguments());
    }

    // Set Mapper fastq format
    jobConf.set(ReadsMapperMapper.FASTQ_FORMAT_KEY, "" + fastqFormat);

    // Set mapper index checksum
    jobConf.set(ReadsMapperMapper.INDEX_CHECKSUM_KEY, "" + computeZipCheckSum(genomeIndexFile, parentConf));

    // timeout
    jobConf.set("mapreduce.task.timeout", "" + HADOOP_TIMEOUT);

    // Don't reuse JVM
    jobConf.set("mapreduce.job.jvm.numtasks", "" + 1);

    // Set the memory required by the reads mapper
    jobConf.set("mapreduce.map.memory.mb", "" + getMapperHadoopMemoryRequired());

    // Set the memory required by JVM (BWA need more memory than the other
    // mapper for buffering named pipes)
    jobConf.set("mapreduce.map.java.opts", "-Xmx4096M");

    // Set ZooKeeper client configuration
    setZooKeeperJobConfiguration(jobConf, context);

    //
    // Alignment filtering
    //

    // Set SAM filter parameters
    addParametersToJobConf(getAlignmentsFilterParameters(), MAP_FILTER_PARAMETER_KEY_PREFIX, jobConf);

    //
    // Job creation
    //

    // Create the job and its name
    final Job job = Job.getInstance(jobConf,
            "Filter and map reads (" + dataName + ", " + Joiner.on(", ").join(filenames) + ")");

    // Set the jar
    job.setJarByClass(ReadsFilterHadoopModule.class);

    // Set input path
    FileInputFormat.addInputPath(job, inputPath);

    // Add genome mapper index to distributed cache

    // Set genome index reference path in the distributed cache
    final Path genomeIndex = new Path(genomeIndexFile.getSource());
    job.addCacheFile(genomeIndex.toUri());

    // Set the input format
    if (inputFormat == READS_FASTQ) {
        job.setInputFormatClass(FastqInputFormat.class);
    } else {
        job.setInputFormatClass(KeyValueTextInputFormat.class);
    }

    // Set the Mappers classes using a chain mapper
    ChainMapper.addMapper(job, ReadsFilterMapper.class, Text.class, Text.class, Text.class, Text.class,
            jobConf);
    ChainMapper.addMapper(job, ReadsMapperMapper.class, Text.class, Text.class, Text.class, Text.class,
            jobConf);
    ChainMapper.addMapper(job, SAMFilterMapper.class, Text.class, Text.class, Text.class, Text.class, jobConf);

    // Set the reducer class
    job.setReducerClass(SAMFilterReducer.class);

    // Set the output format
    job.setOutputFormatClass(SAMOutputFormat.class);

    // Set the output key class
    job.setOutputKeyClass(Text.class);

    // Set the output value class
    job.setOutputValueClass(Text.class);

    // Set output path
    FileOutputFormat.setOutputPath(job, new Path(outFile.getSource()));

    return job;
}

From source file:fr.ens.biologie.genomique.eoulsan.modules.mapping.hadoop.ReadsMapperHadoopModule.java

License:LGPL

/**
 * Create the JobConf object for a sample.
 * @param parentConf Hadoop configuration
 * @param dataName data name
 * @param readsFile reads file
 * @param inputFormat inputFormat
 * @param fastqFormat FASTQ format
 * @param mapperIndexFile mapper index file
 * @param outFile output file
 * @return a new JobConf object
 * @throws IOException if an error occurs while creating the job
 */
private Job createJobConf(final Configuration parentConf, final TaskContext context, final String dataName,
        final DataFile readsFile, final boolean pairedEnd, final DataFormat inputFormat,
        final FastqFormat fastqFormat, final DataFile mapperIndexFile, final DataFile outFile)
        throws IOException {

    final Configuration jobConf = new Configuration(parentConf);

    final Path inputPath = new Path(readsFile.getSource());

    // Set mapper name
    jobConf.set(ReadsMapperMapper.MAPPER_NAME_KEY, getMapperName());

    // Set mapper version
    jobConf.set(ReadsMapperMapper.MAPPER_VERSION_KEY, getMapperVersion());

    // Set mapper flavor
    jobConf.set(ReadsMapperMapper.MAPPER_FLAVOR_KEY, getMapperFlavor());

    // Set pair end or single end mode
    jobConf.set(ReadsMapperMapper.PAIR_END_KEY, Boolean.toString(pairedEnd));

    // Set the number of threads for the mapper
    if (getMapperLocalThreads() > 0) {
        jobConf.set(ReadsMapperMapper.MAPPER_THREADS_KEY, "" + getMapperHadoopThreads());
    }

    // Set mapper arguments
    if (getMapperArguments() != null) {
        jobConf.set(ReadsMapperMapper.MAPPER_ARGS_KEY, doubleQuotes(getMapperArguments()));
    }

    // Set Mapper fastq format
    jobConf.set(ReadsMapperMapper.FASTQ_FORMAT_KEY, "" + fastqFormat);

    // Set mapper index checksum
    jobConf.set(ReadsMapperMapper.INDEX_CHECKSUM_KEY, "" + computeZipCheckSum(mapperIndexFile, parentConf));

    // Set counter group
    jobConf.set(CommonHadoop.COUNTER_GROUP_KEY, COUNTER_GROUP);

    // timeout
    jobConf.set("mapreduce.task.timeout", "" + HADOOP_TIMEOUT);

    // No JVM task reuse
    jobConf.set("mapreduce.job.jvm.numtasks", "" + 1);

    // Set the memory required by the reads mapper
    jobConf.set("mapreduce.map.memory.mb", "" + getMapperHadoopMemoryRequired());

    // Set the memory required by JVM (BWA need more memory than the other
    // mapper for buffering named pipes)
    jobConf.set("mapreduce.map.java.opts", "-Xmx4096M");

    // Set ZooKeeper client configuration
    setZooKeeperJobConfiguration(jobConf, context);

    // Create the job and its name
    final Job job = Job.getInstance(jobConf, "Mapping reads in " + fastqFormat + " with " + getMapperName()
            + " (" + dataName + ", " + readsFile.getName() + ")");

    // Set genome index reference path in the distributed cache
    final Path genomeIndex = new Path(mapperIndexFile.getSource());

    job.addCacheFile(genomeIndex.toUri());

    // Set the jar
    job.setJarByClass(ReadsMapperHadoopModule.class);

    // Set input path
    FileInputFormat.addInputPath(job, inputPath);

    // Set the input format
    if (inputFormat == READS_FASTQ) {
        job.setInputFormatClass(FastqInputFormat.class);
    } else {
        job.setInputFormatClass(KeyValueTextInputFormat.class);
    }

    // Set the Mapper class
    job.setMapperClass(ReadsMapperMapper.class);

    // Set the output format
    job.setOutputFormatClass(SAMOutputFormat.class);

    // Set the output key class
    job.setOutputKeyClass(Text.class);

    // Set the output value class
    job.setOutputValueClass(Text.class);

    // Set the number of reducers
    job.setNumReduceTasks(0);

    // Set output path
    FileOutputFormat.setOutputPath(job, new Path(outFile.getSource()));

    return job;
}

From source file:graphcreator.Creator.java

License:Apache License

public static void main(String[] args) throws Exception {

    /*
     * args[0] is the hdfs input path
     * args[1] is the hdfs output path
     * args[2] is the location of the single input file
     * args[3] is the k value for kNNG creation, -1 if a full graph is to be used
     */

    /**
     * conf1 gives a smaller amount of input to each MapReduce job, making it optimal for MR jobs where output
     * or intermediate data volume is significantly larger than input data volume
     */
    Configuration conf1 = new Configuration();
    conf1.set("mapreduce.input.fileinputformat.split.maxsize", "5000");
    conf1.set("mapreduce.job.split.metainfo.maxsize", "-1");
    conf1.set("mapreduce.job.reduces", "100");

    /**
     * conf2 is optimal for MapReduce jobs where data volume is roughly consistent throughout the MR job.
     */
    Configuration conf2 = new Configuration();
    conf2.set("mapreduce.input.fileinputformat.split.maxsize", "5000000");
    conf2.set("mapreduce.job.split.metainfo.maxsize", "-1");
    conf2.set("mapreduce.job.reduces", "100");
    conf2.set("kVal", args[3]);

    /* GRAPH CREATION */
    Job job1 = Job.getInstance(conf1, "edge creation");

    job1.setJarByClass(Creator.class);
    job1.setMapperClass(EdgeCreate.class);
    job1.setReducerClass(DistanceCalc.class);
    job1.setMapOutputKeyClass(Text.class);
    job1.setMapOutputValueClass(Text.class);
    job1.setOutputKeyClass(IntWritable.class);
    job1.setOutputValueClass(Text.class);

    FileInputFormat.addInputPath(job1, new Path(args[0]));
    job1.addCacheFile(new URI("hdfs://localhost:9000/user/hduser/" + args[2]));

    if (!args[3].equals("-1")) {
        FileOutputFormat.setOutputPath(job1, new Path(args[1] + INTERMEDIATE_PATH + "1"));
    } else {
        FileOutputFormat.setOutputPath(job1, new Path(args[1] + "/output"));
    }

    job1.waitForCompletion(true);

    /*
     * Only run kNNG trimming MapReduce jobs if a kNNG is desired, as opposed to a complete graph.
     */
    if (!args[3].equals("-1")) {

        Job job2 = Job.getInstance(conf2, "trimming 1");

        job2.setJarByClass(Creator.class);
        job2.setMapperClass(BinByVertex.class);
        job2.setReducerClass(kNNFilter.class);
        job2.setMapOutputKeyClass(Text.class);
        job2.setMapOutputValueClass(Text.class);
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job2, new Path(args[1] + INTERMEDIATE_PATH + "1"));
        FileOutputFormat.setOutputPath(job2, new Path(args[1] + INTERMEDIATE_PATH + "2"));

        job2.waitForCompletion(true);

        Job job3 = Job.getInstance(conf2, "trimming 2");
        job3.setJarByClass(Creator.class);
        job3.setMapperClass(IdentityMap.class);
        job3.setReducerClass(RemoveDuplicateEdges.class);
        job3.setMapOutputKeyClass(Text.class);
        job3.setMapOutputValueClass(Text.class);
        job3.setOutputKeyClass(Text.class);
        job3.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job3, new Path(args[1] + INTERMEDIATE_PATH + "2"));
        FileOutputFormat.setOutputPath(job3, new Path(args[1] + "/output"));

        job3.waitForCompletion(true);
    }
}
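The URI above hard-codes the NameNode address (hdfs://localhost:9000). A hedged alternative, assuming the default FileSystem in the job configuration already points at the same cluster, is to let the FileSystem qualify the path before caching it:

// Fragment of an alternative driver snippet; reuses conf1, job1, and args from above.
// Requires org.apache.hadoop.fs.FileSystem.
FileSystem fs = FileSystem.get(conf1);
Path cacheFile = fs.makeQualified(new Path("/user/hduser/" + args[2]));
job1.addCacheFile(cacheFile.toUri());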

From source file:hadoop.OperatorMapRedMapper.java

public int run(String[] allArgs) throws Exception {
    String[] args = new GenericOptionsParser(getConf(), allArgs).getRemainingArgs();

    Job job = Job.getInstance(getConf());

    FileSystem fs = FileSystem.get(new Configuration());
    fs.exists(new Path(args[2]));
    fs.delete(new Path(args[2]), true);
    job.setJarByClass(OperatorMapRedMapper.class); // necessary (fixed)

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapperClass(MapCatalogToPartition.class);

    job.setReducerClass(OperatorMapRedReducer.class);

    /**
     * args[3] = number of partitions or number of nodes
     */
    job.setNumReduceTasks(Integer.valueOf(args[3]));

    FileInputFormat.setInputPaths(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[2]));
    job.addCacheFile(new URI(args[1]));

    job.waitForCompletion(true);

    return 0;
}

From source file:info.halo9pan.word2vec.hadoop.mr.WordSort.java

License:Apache License

public int run(String[] args) throws Exception {
    logger.info("starting");
    Job job = Job.getInstance(getConf());
    Path inputDir = new Path(args[0]);
    Path outputDir = new Path(args[1]);
    boolean useSimplePartitioner = getUseSimplePartitioner(job);
    SortInputFormat.setInputPaths(job, inputDir);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setJobName("WordSort");
    job.setJarByClass(WordSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(SortInputFormat.class);
    job.setOutputFormatClass(SortOutputFormat.class);
    if (useSimplePartitioner) {
        job.setPartitionerClass(SimplePartitioner.class);
    } else {
        long start = System.currentTimeMillis();
        Path partitionFile = new Path(outputDir, SortInputFormat.PARTITION_FILENAME);
        URI partitionUri = new URI(partitionFile.toString() + "#" + SortInputFormat.PARTITION_FILENAME);
        try {
            SortInputFormat.writePartitionFile(job, partitionFile);
        } catch (Throwable e) {
            logger.error(e.getMessage());
            return -1;
        }
        job.addCacheFile(partitionUri);
        long end = System.currentTimeMillis();
        System.out.println("Spent " + (end - start) + "ms computing partitions.");
        job.setPartitionerClass(TotalOrderPartitioner.class);
    }

    job.getConfiguration().setInt("dfs.replication", getOutputReplication(job));
    SortOutputFormat.setFinalSync(job, true);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    logger.info("done");
    return ret;
}
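The "#" fragment appended to the cached URI sets the name of the link created in each task's working directory, which is how the partitioner finds the partition file at run time. A hedged sketch of the equivalent explicit wiring, assuming org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner (whose default partition file name is _partition.lst); the helper class is illustrative only.

import java.net.URI;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class PartitionFileWiring {

    // Driver-side helper: either rely on the "#_partition.lst" fragment (the
    // local link name then matches TotalOrderPartitioner's default), or point
    // the partitioner at the file explicitly as done here.
    static void configureTotalOrder(Job job, Path partitionFile) throws Exception {
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        job.setPartitionerClass(TotalOrderPartitioner.class);

        // Cache-file route: the URI fragment controls the name of the local link.
        job.addCacheFile(new URI(partitionFile.toString() + "#_partition.lst"));
    }
}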

From source file:io.dataapps.chlorine.hadoop.HDFSScanMR.java

License:Apache License

public static Job makeJob(Configuration conf, Path in, Path out, String matchPath, long scanSince,
        String chlorineConfigFilePath, String queue, String maskPath) throws IOException {
    conf.setBoolean("mapred.output.compress", false);
    conf.setLong("scanSince", scanSince);
    conf.set("matchPath", matchPath);
    conf.set("maskPath", maskPath);
    conf.set("inputPath", in.toString());
    if (queue != null) {
        conf.set("mapred.job.queue.name", queue);
    }
    conf.set("fs.permissions.umask-mode", "007");
    conf.setInt("input_path_depth", in.depth());
    Job job = Job.getInstance(conf, "Chlorine_HDFS_Scan");
    job.setJarByClass(HDFSScanMR.class);
    if (chlorineConfigFilePath != null) {
        try {
            job.addCacheFile(new URI(chlorineConfigFilePath));
            conf.set("finder_file", (new File(chlorineConfigFilePath)).getName());
        } catch (URISyntaxException e) {
            LOG.error(e);
        }
    }
    job.setMapperClass(DeepScanMapper.class);
    job.setNumReduceTasks(0);
    job.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(job, in);
    TextInputFormat.setInputDirRecursive(job, true);
    TextInputFormat.setInputPathFilter(job, NewFilesFilter.class);
    FileOutputFormat.setOutputPath(job, out);
    LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
    return job;
}
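This driver stores only the configuration file's base name under the finder_file key; because addCacheFile links the localized copy into the task working directory under that name, a mapper can open it as a plain local file. An illustrative sketch of that read path (this is not the Chlorine project's DeepScanMapper):

import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.util.List;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class FinderFileMapper extends Mapper<LongWritable, Text, Text, Text> {

    private List<String> finderConfigLines;

    @Override
    protected void setup(Context context) throws IOException {
        // Base name stored by the driver; the cached file is linked under
        // this name in the task working directory.
        String finderFile = context.getConfiguration().get("finder_file");
        if (finderFile != null) {
            finderConfigLines = Files.readAllLines(new File(finderFile).toPath());
        }
    }
}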

From source file:mapreducesentiment.Main.java

@Override
public int run(String[] args) throws Exception {

    Configuration conf = new Configuration();

    // Memory configuration so that 16 map tasks can run
    conf.set("mapreduce.map.memory.mb", "1400");
    conf.set("mapreduce.reduce.memory.mb", "2800");
    conf.set("mapreduce.map.java.opts", "-Xmx1120m");
    conf.set("mapreduce.reduce.java.opts", "-Xmx2240m");
    conf.set("yarn.app.mapreduce.am.resource.mb", "2800");
    conf.set("yarn.app.mapreduce.am.command-opts", "-Xmx2240m");
    conf.set("yarn.nodemanager.resource.memory-mb", "5040");
    conf.set("yarn.scheduler.minimum-allocation-mb", "1400");
    conf.set("yarn.scheduler.maximum-allocation-mb", "5040");
    conf.set("mapreduce.task.timeout", "0");//NO timeout

    // Maximum split size, which determines the number of splits/mappers
    conf.set("mapreduce.input.fileinputformat.split.minsize", "0");
    conf.set("mapreduce.input.fileinputformat.split.maxsize", "104500");//total size / data nodes

    Job job = new Job(conf, "sentiment");

    job.setOutputKeyClass(SentimentKeyWritableComparable.class);
    job.setOutputValueClass(LongWritable.class);

    job.setMapperClass(SentimentMapper.class);
    job.setReducerClass(SentimentReducer.class);

    job.setInputFormatClass(MovieCommentInputFormat.class);

    // The comment corpus file is read from blob storage
    FileInputFormat.setInputPaths(job, new Path("wasb:///movies800K.txt"));//args[0]));
    FileOutputFormat.setOutputPath(job, new Path("wasb:///sentiment/test/movies800kb"));//args[1]));

    // Libraries copied to each data node's cache
    job.addCacheFile(new Path("wasb:///ejml-0.23.jar").toUri());
    job.addCacheFile(new Path("wasb:///javax.json.jar").toUri());
    job.addCacheFile(new Path("wasb:///jollyday.jar").toUri());
    job.addCacheFile(new Path("wasb:///stanford-corenlp-3.4.1.jar").toUri());
    job.addCacheFile(new Path("wasb:///stanford-corenlp-3.4.1-models.jar").toUri());
    job.addCacheFile(new Path("wasb:///xom.jar").toUri());

    job.setJarByClass(Main.class);

    job.submit();
    return 0;
}
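Caching the jars this way makes them available in each task's working directory, but it does not by itself put them on the task classpath. If the dependencies need to be on the classpath, Job.addFileToClassPath (or the -libjars generic option) is the usual route. A hedged fragment using the same wasb:// locations as above:

// Fragment: classpath alternative, reusing the job built above.
// Requires org.apache.hadoop.fs.Path.
job.addFileToClassPath(new Path("wasb:///ejml-0.23.jar"));
job.addFileToClassPath(new Path("wasb:///javax.json.jar"));
job.addFileToClassPath(new Path("wasb:///jollyday.jar"));
job.addFileToClassPath(new Path("wasb:///stanford-corenlp-3.4.1.jar"));
job.addFileToClassPath(new Path("wasb:///stanford-corenlp-3.4.1-models.jar"));
job.addFileToClassPath(new Path("wasb:///xom.jar"));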