Example usage for org.apache.hadoop.mapreduce Job addCacheFile

List of usage examples for org.apache.hadoop.mapreduce Job addCacheFile

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce Job addCacheFile.

Prototype

public void addCacheFile(URI uri) 

Document

Add a file to be localized. The file is copied into the distributed cache and made available locally to every task of the job before it runs.
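
Before the usage listing, here is a minimal self-contained sketch of the typical pattern: the driver calls addCacheFile, and each task reads the localized copy from its working directory in setup(). Everything in this sketch (the CacheFileDriver and LookupFilterMapper class names, the hdfs:///data/lookup.txt path, and the #lookup fragment) is an illustrative placeholder, not taken from the source files below.

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.HashSet;
import java.util.Set;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CacheFileDriver {

    // Reads the cached file once per task and keeps only input lines it contains
    public static class LookupFilterMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final Set<String> lookup = new HashSet<>();

        @Override
        protected void setup(Context context) throws IOException {
            // The "#lookup" fragment on the cached URI makes the file visible as ./lookup
            try (BufferedReader reader = new BufferedReader(new FileReader("lookup"))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    lookup.add(line.trim());
                }
            }
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            if (lookup.contains(value.toString().trim())) {
                context.write(value, new IntWritable(1));
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "addCacheFile-example");
        job.setJarByClass(CacheFileDriver.class);
        job.setMapperClass(LookupFilterMapper.class);
        job.setNumReduceTasks(0);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Copy the HDFS file to every task node; the URI fragment names the symlink
        job.addCacheFile(new URI("hdfs:///data/lookup.txt#lookup"));

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}

If the URI carries no # fragment, the symlink in the task working directory simply keeps the file's base name.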

Usage

From source file:com.lakhani.anchorgraph.testCache.java

public int run(String[] args) throws Exception {
    Configuration conf = new Configuration();

    Job job = new Job(conf, "testCache");
    job.addCacheFile(new URI("hdfs://zphdc1n1:8020/user/clakhani/anchorgraph/centroids.txt"));
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setMapperClass(Map.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setJarByClass(testCache.class);
    // waitForCompletion(true) submits the job (if it has not been submitted yet) and blocks
    // until it finishes; return 0 on success, following the usual Tool/ToolRunner convention
    int rc = job.waitForCompletion(true) ? 0 : 1;
    return rc;
}
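
Note that this call passes a plain URI with no # fragment, so the cached centroids.txt shows up in each task's working directory under its base name; the mapper (not shown here) can open it with ordinary java.io calls in setup(), or inspect the configured cache URIs through context.getCacheFiles().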

From source file:com.phantom.hadoop.examples.terasort.TeraSort.java

License:Apache License

public int run(String[] args) throws Exception {
    LOG.info("starting");
    Job job = Job.getInstance(getConf());
    Path inputDir = new Path(args[0]);
    Path outputDir = new Path(args[1]);
    boolean useSimplePartitioner = getUseSimplePartitioner(job);
    TeraInputFormat.setInputPaths(job, inputDir);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setJobName("TeraSort");
    job.setJarByClass(TeraSort.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(TeraInputFormat.class);
    job.setOutputFormatClass(TeraOutputFormat.class);
    if (useSimplePartitioner) {
        job.setPartitionerClass(SimplePartitioner.class);
    } else {
        long start = System.currentTimeMillis();
        Path partitionFile = new Path(outputDir, TeraInputFormat.PARTITION_FILENAME);
        URI partitionUri = new URI(partitionFile.toString() + "#" + TeraInputFormat.PARTITION_FILENAME);
        try {
            TeraInputFormat.writePartitionFile(job, partitionFile);
        } catch (Throwable e) {
            LOG.error(e.getMessage());
            return -1;
        }
        job.addCacheFile(partitionUri);
        long end = System.currentTimeMillis();
        System.out.println("Spent " + (end - start) + "ms computing partitions.");
        job.setPartitionerClass(TotalOrderPartitioner.class);
    }

    job.getConfiguration().setInt("dfs.replication", getOutputReplication(job));
    TeraOutputFormat.setFinalSync(job, true);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    LOG.info("done");
    return ret;
}
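
Here the # fragment appended to partitionUri makes the partition file visible in each task's working directory under the fixed name TeraInputFormat.PARTITION_FILENAME, which is the name the total-order partitioner reads its split points from at run time.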

From source file:com.xiaomi.linden.hadoop.indexing.job.LindenJob.java

License:Apache License

@Override
public int run(String[] strings) throws Exception {
    Configuration conf = getConf();
    String dir = conf.get(LindenJobConfig.INPUT_DIR, null);
    logger.info("input dir:" + dir);
    Path inputPath = new Path(StringUtils.unEscapeString(dir));
    Path outputPath = new Path(conf.get(LindenJobConfig.OUTPUT_DIR));
    String indexPath = conf.get(LindenJobConfig.INDEX_PATH);

    FileSystem fs = FileSystem.get(conf);
    if (fs.exists(outputPath)) {
        fs.delete(outputPath, true);
    }
    if (fs.exists(new Path(indexPath))) {
        fs.delete(new Path(indexPath), true);
    }

    int numShards = conf.getInt(LindenJobConfig.NUM_SHARDS, 1);
    Shard[] shards = createShards(indexPath, numShards);

    Shard.setIndexShards(conf, shards);

    //empty trash;
    (new Trash(conf)).expunge();

    Job job = Job.getInstance(conf, "linden-hadoop-indexing");
    job.setJarByClass(LindenJob.class);
    job.setMapperClass(LindenMapper.class);
    job.setCombinerClass(LindenCombiner.class);
    job.setReducerClass(LindenReducer.class);
    job.setMapOutputKeyClass(Shard.class);
    job.setMapOutputValueClass(IntermediateForm.class);
    job.setOutputKeyClass(Shard.class);
    job.setOutputValueClass(Text.class);
    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(IndexUpdateOutputFormat.class);
    job.setReduceSpeculativeExecution(false);
    job.setNumReduceTasks(numShards);

    String lindenSchemaFile = conf.get(LindenJobConfig.SCHEMA_FILE_URL);
    if (lindenSchemaFile == null) {
        throw new IOException("no schema file is found");
    }
    logger.info("Adding schema file: " + lindenSchemaFile);
    job.addCacheFile(new URI(lindenSchemaFile + "#lindenSchema"));
    String lindenPropertiesFile = conf.get(LindenJobConfig.LINDEN_PROPERTIES_FILE_URL);
    if (lindenPropertiesFile == null) {
        throw new IOException("no linden properties file is found");
    }
    logger.info("Adding linden properties file: " + lindenPropertiesFile);
    job.addCacheFile(new URI(lindenPropertiesFile + "#lindenProperties"));

    FileInputFormat.setInputPaths(job, inputPath);
    FileOutputFormat.setOutputPath(job, outputPath);

    Path[] inputs = FileInputFormat.getInputPaths(job);
    StringBuilder buffer = new StringBuilder(inputs[0].toString());
    for (int i = 1; i < inputs.length; i++) {
        buffer.append(",");
        buffer.append(inputs[i].toString());
    }
    logger.info("mapreduce.input.dir = " + buffer.toString());
    logger.info("mapreduce.output.dir = " + FileOutputFormat.getOutputPath(job).toString());
    logger.info("mapreduce.job.num.reduce.tasks = " + job.getNumReduceTasks());
    logger.info(shards.length + " shards = " + conf.get(LindenJobConfig.INDEX_SHARDS));
    logger.info("mapreduce.input.format.class = " + job.getInputFormatClass());
    logger.info("mapreduce.output.format.class = " + job.getOutputFormatClass());
    logger.info("mapreduce.cluster.temp.dir = " + conf.get(MRJobConfig.TEMP_DIR));

    job.waitForCompletion(true);
    if (!job.isSuccessful()) {
        throw new RuntimeException("Job failed");
    }
    return 0;
}
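
The #lindenSchema and #lindenProperties fragments rename the localized copies, so map and reduce tasks can open the schema and properties files under those fixed names in their working directories, regardless of where the original URLs point.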

From source file:de.tudarmstadt.ukp.experiments.dip.hadoop.OriginalURLGrep.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    org.apache.hadoop.conf.Configuration conf = getConf();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();

    System.out.println("Other args: " + Arrays.toString(otherArgs));

    // use the configuration that GenericOptionsParser just populated
    Job job = Job.getInstance(conf);
    job.setJarByClass(OriginalURLGrep.class);

    job.setJobName(OriginalURLGrep.class.getName());
    job.setMapperClass(OrigURLGrepMapper.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    // cache file - IDs for index
    String idFile = args[2];
    System.err.println("idFile: " + idFile);
    job.addCacheFile(new URI(idFile + "#" + NODE_IDS));

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    String commaSeparatedInputFiles = otherArgs[0];
    String outputPath = otherArgs[1];
    System.err.println("commaSeparatedInputFiles: " + commaSeparatedInputFiles);
    System.err.println("outputPath: " + outputPath);

    FileInputFormat.addInputPaths(job, commaSeparatedInputFiles);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:diamondmapreduce.DiamondMapReduce.java

License:Apache License

int launchHamond(String[] arguments) throws Exception {

    //extract diamond, query, reference and output from array
    String diamond = arguments[0];
    String query = arguments[1];
    String dataBase = arguments[2];
    String outPut = arguments[3];

    //set Hadoop configuration
    Job job = Job.getInstance(getConf(), "DIAMOND");
    Configuration conf = job.getConfiguration();
    SetConf.setHadoopConf(conf);

    //get user name
    userName = HadoopUser.getHadoopUser();

    //delete all existing DIAMOND files under current Hadoop user
    DeleteHDFSFiles.deleteAllFiles(userName);

    //make Hamond directory on HDFS
    MakeHamondHDFSdir.makedir(conf, userName);

    //make DIAMOND database on local then copy to HDFS with query and delete local database
    MakeDB.makeDB(diamond, dataBase);

    //copy DIAMOND bin, query and local database file to HDFS
    CopyFromLocal.copyFromLocal(conf, diamond, query, dataBase, userName);

    //pass query name and database name to mappers
    conf.set(QUERY, query);
    conf.set(DATABASE, dataBase + ".dmnd");
    String[] subArgs = Arrays.copyOfRange(arguments, 4, arguments.length);
    conf.setStrings("DIAMOND-arguments", subArgs);
    conf.setStrings(OUTPUT, outPut);

    //add DIAMOND bin and database into distributed cache
    job.addCacheFile(new URI("/user/" + userName + "/Hamond/diamond"));
    job.addCacheFile(new URI("/user/" + userName + "/Hamond/" + new Path(dataBase).getName() + ".dmnd"));

    //set job input and output paths
    FileInputFormat.addInputPath(job, new Path("/user/" + userName + "/Hamond/" + new Path(query).getName()));
    FileOutputFormat.setOutputPath(job, new Path("/user/" + userName + "/Hamond/out"));

    //set job driver and mapper
    job.setJarByClass(DiamondMapReduce.class);
    job.setMapperClass(DiamondMapper.class);

    //set job input format into customized multilines format
    job.setInputFormatClass(CustomNLineFileInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setNumReduceTasks(0);

    return job.waitForCompletion(true) ? 0 : 1;

}
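
The cache URIs in this job carry no scheme, so they are resolved against the cluster's default filesystem (fs.defaultFS, typically HDFS), and with no # fragment the localized symlinks keep the base names of the DIAMOND binary and the .dmnd database file.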

From source file:diamondmapreduce.DiamondMapReduce.java

License:Apache License

int launchHamondAWS(String[] arguments) throws Exception {

    //extract diamond, query, reference and output from array
    String diamond = arguments[0];
    String query = arguments[1];
    String dataBase = arguments[2];
    String outPut = arguments[3];

    //set Hadoop configuration
    Job job = Job.getInstance(getConf(), "DIAMOND");
    Configuration conf = job.getConfiguration();
    SetConf.setHadoopConf(conf);

    //get user name
    userName = HadoopUser.getHadoopUser();

    //delete all existing DIAMOND files under current Hadoop user
    DeleteHDFSFiles.deleteAllFiles(userName);

    //make local Hamond dir
    awshamondsidefunctions.MakeHamondDir.make();

    //copy DIAMOND, query, reference from S3 to master local
    awshamondsidefunctions.CopyFromS3.copyFromS3(diamond, query, dataBase);

    //make Hamond directory on HDFS
    MakeHamondHDFSdir.makedir(conf, userName);

    //make DIAMOND database on local then copy to HDFS with query and delete local database
    MakeDB.makeDB("/mnt/Hamond/diamond", "/mnt/Hamond/" + new Path(dataBase).getName());

    //copy DIAMOND bin, query and local database file to HDFS
    CopyFromLocal.copyFromLocal(conf, "/mnt/Hamond/diamond", "/mnt/Hamond/" + new Path(query).getName(),
            "/mnt/Hamond/" + new Path(dataBase).getName(), userName);

    //pass query name and database name to mappers
    conf.set(QUERY, query);
    conf.set(DATABASE, dataBase);
    conf.set(OUTPUT, outPut);
    String[] subArgs = Arrays.copyOfRange(arguments, 4, arguments.length);
    conf.setStrings("DIAMOND-arguments", subArgs);
    conf.setStrings(OUTPUT, outPut);

    //add DIAMOND bin and database into distributed cache
    job.addCacheFile(new URI("/user/" + userName + "/Hamond/diamond"));
    job.addCacheFile(new URI("/user/" + userName + "/Hamond/" + new Path(dataBase).getName() + ".dmnd"));

    //set job input and output paths
    FileInputFormat.addInputPath(job, new Path("/user/" + userName + "/Hamond/" + new Path(query).getName()));
    FileOutputFormat.setOutputPath(job, new Path("/user/" + userName + "/Hamond/out"));

    //set job driver and mapper
    job.setJarByClass(DiamondMapReduce.class);
    job.setMapperClass(DiamondMapper.class);
    job.setReducerClass(AWSDiamondReducer.class);

    //set job input format into customized multilines format
    job.setInputFormatClass(CustomNLineFileInputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setNumReduceTasks(1);

    return job.waitForCompletion(true) ? 0 : 1;

}

From source file:dz.lab.mapred.exclude.StartsWithCountJob_DistCacheAPI.java

@Override
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    // the following property will enable mapreduce to use its packaged local job runner
    //conf.set("mapreduce.framework.name", "local");

    Job job = Job.getInstance(conf, "StartsWithCountJob");
    job.setJarByClass(getClass());

    // configure output and input source
    TextInputFormat.addInputPath(job, new Path(args[0]));
    job.setInputFormatClass(TextInputFormat.class);

    // configure mapper and reducer
    job.setMapperClass(StartsWithCountMapper.class);
    job.setCombinerClass(StartsWithCountReducer.class);
    job.setReducerClass(StartsWithCountReducer.class);

    // configure output
    TextOutputFormat.setOutputPath(job, new Path(args[1]));
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    Path toCache = new Path("/training/data/startWithExcludeFile.txt");
    // add file to cache
    job.addCacheFile(toCache.toUri());
    // create symbolic links for all files in the DistributedCache; without the links you would
    // have to use the fully qualified path. Note that Job#createSymlink() is deprecated and a
    // no-op on Hadoop 2+, where symlinks are always created in the task working directory.
    job.createSymlink();

    return job.waitForCompletion(true) ? 0 : 1;
}

From source file:edu.gslis.ts.hadoop.ThriftBulkLoader.java

License:Apache License

public int run(String[] args) throws Exception {
    String tableName = args[0];
    String inputPath = args[1];
    String outputPath = args[2];
    Path topicsFile = new Path(args[3]);
    Path vocabFile = new Path(args[4]);
    Path dateBinFile = new Path(args[5]);

    Configuration config = getConf();
    config.set("hbase.table.name", tableName);
    HBaseConfiguration.addHbaseResources(config);

    Job job = Job.getInstance(config);
    job.setJarByClass(ThriftBulkLoader.class);
    job.setJobName("Bulk Loading HBase Table::" + tableName);
    job.setInputFormatClass(ThriftFileInputFormat.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapperClass(ThriftFilterMapper.class);

    Path output = new Path(outputPath);
    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileInputFormat.setInputDirRecursive(job, true);
    FileOutputFormat.setOutputPath(job, output);

    job.setMapOutputValueClass(Put.class);

    job.addCacheFile(topicsFile.toUri());
    job.addCacheFile(vocabFile.toUri());
    job.addCacheFile(dateBinFile.toUri());

    job.getConfiguration().setBoolean("mapreduce.map.output.compress", true);
    // "mapred.map.output.compression.codec" is the deprecated spelling of
    // "mapreduce.map.output.compress.codec"; Hadoop 2 still maps it to the new key
    job.getConfiguration().setClass("mapred.map.output.compression.codec",
            org.apache.hadoop.io.compress.SnappyCodec.class,
            org.apache.hadoop.io.compress.CompressionCodec.class);
    job.getConfiguration().set("hfile.compression", Compression.Algorithm.SNAPPY.getName());

    //RegionLocator regionLocator = conn.getRegionLocator(tableName);
    //HFileOutputFormat2.configureIncrementalLoad(job, new HTable(config,tableName));

    Connection con = ConnectionFactory.createConnection(config);
    TableName htableName = TableName.valueOf(tableName);
    HFileOutputFormat2.configureIncrementalLoad(job, con.getTable(htableName),
            con.getRegionLocator(htableName));

    job.waitForCompletion(true);
    if (job.isSuccessful()) {
        // Couldn't find a better way to do this. The LoadIncrementalHFiles
        // seems to want 777 permissions on the output directory.
        try {
            Runtime rt = Runtime.getRuntime();
            rt.exec("hadoop fs -chmod -R 777 " + output);
        } catch (Exception e) {
            e.printStackTrace();
        }
        /*
        LoadIncrementalHFiles loader = new LoadIncrementalHFiles(config);
        HTable htable = new HTable(config, tableName);
        loader.doBulkLoad(new Path(outputPath), htable);
        */

    } else {
        throw new IOException("error with job");
    }

    return 0;

    /*
    Job job = Job.getInstance(config);
    job.setJarByClass(ThriftBulkLoader.class);
            
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);  
    job.setMapOutputValueClass(Put.class);  
    job.setInputFormatClass(ThriftFileInputFormat.class);
            
    //HFileOutputFormat2.configureIncrementalLoad(job, htable);
            
    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileInputFormat.setInputDirRecursive(job, true);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));        
            
    job.addCacheFile(topicsFile.toUri());
    job.addCacheFile(vocabFile.toUri());
            
    job.setMapperClass(ThriftFilterMapper.class);
            
    boolean b = job.waitForCompletion(true);
    if (!b) {
    throw new IOException("error with job");
    }
            
    LoadIncrementalHFiles loader = new LoadIncrementalHFiles(config);
    loader.doBulkLoad(new Path(outputPath), htable);
            
    return 0;        
    */
}

From source file:edu.gslis.ts.hadoop.ThriftDumper.java

License:Apache License

public int run(String[] args) throws Exception {
    String inputPath = args[0];
    String outputPath = args[1];
    Path topicsFile = new Path(args[2]);
    Path vocabFile = new Path(args[3]);

    Configuration config = getConf();
    Job job = Job.getInstance(config);
    job.setJarByClass(ThriftDumper.class);
    job.setInputFormatClass(ThriftFileInputFormat.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setReducerClass(ThriftDumperReducer.class);

    job.setOutputFormatClass(TextOutputFormat.class);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileInputFormat.setInputDirRecursive(job, true);
    FileOutputFormat.setOutputPath(job, new Path(outputPath));
    job.addCacheFile(topicsFile.toUri());
    job.addCacheFile(vocabFile.toUri());

    job.setMapperClass(ThriftDumperMapper.class);

    boolean b = job.waitForCompletion(true);
    if (!b) {
        throw new IOException("error with job");
    }

    return 0;
}

From source file:edu.gslis.ts.hadoop.ThriftRMScorerHbaseMR.java

License:Apache License

public int run(String[] args) throws Exception {
    String tableName = args[0];
    Path topicsFile = new Path(args[1]);
    Path vocabFile = new Path(args[2]);
    Path outputPath = new Path(args[3]);
    Path stoplist = new Path(args[4]);
    // String queryId = args[1];

    Configuration config = HBaseConfiguration.create(getConf());
    config.set("hbase.table.name", tableName);
    Job job = Job.getInstance(config);
    job.setJarByClass(ThriftRMScorerHbaseMR.class);

    Scan scan = new Scan();
    scan.setCaching(500);
    scan.setCacheBlocks(false);
    /*
    Filter prefixFilter = new PrefixFilter(Bytes.toBytes(queryId));
    scan.setFilter(prefixFilter);
    */

    TableMapReduceUtil.initTableMapperJob(tableName, scan, ThriftTableMapper.class, IntWritable.class, // mapper output key
            Text.class, // mapper output value
            job);

    job.setReducerClass(ThriftTableReducer.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.addCacheFile(topicsFile.toUri());
    job.addCacheFile(vocabFile.toUri());
    job.addCacheFile(stoplist.toUri());
    FileOutputFormat.setOutputPath(job, outputPath);

    boolean b = job.waitForCompletion(true);
    if (!b) {
        throw new IOException("error with job!");
    }
    return 0;
}