Example usage for org.apache.hadoop.mapred JobConf setSpeculativeExecution

List of usage examples for org.apache.hadoop.mapred JobConf setSpeculativeExecution

Introduction

This page collects example usages of org.apache.hadoop.mapred.JobConf#setSpeculativeExecution drawn from open-source projects.

Prototype

public void setSpeculativeExecution(boolean speculativeExecution) 

Document

Turn speculative execution on or off for this job.
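
As a quick orientation before the project examples below, here is a minimal, self-contained sketch of the call in context. The class name, input/output paths, and key/value types are placeholders chosen for illustration; only the JobConf wiring reflects the API.

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextInputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;

public class SpeculationOffExample {
    public static void main(String[] args) throws IOException {
        JobConf conf = new JobConf(SpeculationOffExample.class);
        conf.setJobName("speculation-off-example");

        // Disable speculative execution for both map and reduce tasks,
        // typically because the tasks write side-effect files that a
        // speculative duplicate attempt would also try to write.
        conf.setSpeculativeExecution(false);

        // No mapper/reducer set: the identity implementations are used,
        // so the job simply copies offset/line pairs to the output.
        conf.setInputFormat(TextInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.setOutputKeyClass(LongWritable.class);
        conf.setOutputValueClass(Text.class);

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf);
    }
}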

Usage

From source file:com.test.PiEstimatorKrb.java

License:Apache License

/**
 * Run a map/reduce job for estimating Pi.
 *
 * @return the estimated value of Pi
 */
public static BigDecimal estimate(int numMaps, long numPoints, JobConf jobConf) throws IOException {
    //setup job conf
    jobConf.setJobName(PiEstimatorKrb.class.getSimpleName());

    jobConf.setInputFormat(SequenceFileInputFormat.class);

    jobConf.setOutputKeyClass(BooleanWritable.class);
    jobConf.setOutputValueClass(LongWritable.class);
    jobConf.setOutputFormat(SequenceFileOutputFormat.class);

    jobConf.setMapperClass(PiMapper.class);
    jobConf.setNumMapTasks(numMaps);

    jobConf.setReducerClass(PiReducer.class);
    jobConf.setNumReduceTasks(1);

    // turn off speculative execution, because DFS doesn't handle
    // multiple writers to the same file.
    jobConf.setSpeculativeExecution(false);

    //setup input/output directories
    final Path inDir = new Path(TMP_DIR, "in");
    final Path outDir = new Path(TMP_DIR, "out");
    FileInputFormat.setInputPaths(jobConf, inDir);
    FileOutputFormat.setOutputPath(jobConf, outDir);

    final FileSystem fs = FileSystem.get(jobConf);
    if (fs.exists(TMP_DIR)) {
        throw new IOException(
                "Tmp directory " + fs.makeQualified(TMP_DIR) + " already exists.  Please remove it first.");
    }
    if (!fs.mkdirs(inDir)) {
        throw new IOException("Cannot create input directory " + inDir);
    }

    try {
        //generate an input file for each map task
        for (int i = 0; i < numMaps; ++i) {
            final Path file = new Path(inDir, "part" + i);
            final LongWritable offset = new LongWritable(i * numPoints);
            final LongWritable size = new LongWritable(numPoints);
            final SequenceFile.Writer writer = SequenceFile.createWriter(fs, jobConf, file, LongWritable.class,
                    LongWritable.class, CompressionType.NONE);
            try {
                writer.append(offset, size);
            } finally {
                writer.close();
            }
            sLogger.info("Wrote input for Map #" + i);
        }

        //start a map/reduce job
        sLogger.info("Starting Job");
        final long startTime = System.currentTimeMillis();

        if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
            jobConf.set("mapreduce.job.credentials.binary", System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
        }

        JobClient.runJob(jobConf);
        final double duration = (System.currentTimeMillis() - startTime) / 1000.0;
        sLogger.info("Job Finished in " + duration + " seconds");

        //read outputs
        Path inFile = new Path(outDir, "reduce-out");
        LongWritable numInside = new LongWritable();
        LongWritable numOutside = new LongWritable();
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, inFile, jobConf);
        try {
            reader.next(numInside, numOutside);
        } finally {
            reader.close();
        }

        //compute estimated value
        return BigDecimal.valueOf(4).setScale(20).multiply(BigDecimal.valueOf(numInside.get()))
                .divide(BigDecimal.valueOf(numMaps)).divide(BigDecimal.valueOf(numPoints));
    } finally {
        fs.delete(TMP_DIR, true);
    }
}
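
The estimator above turns speculation off for the whole job because its single reducer writes one fixed output file (reduce-out). If only one phase has that constraint, JobConf also exposes per-phase setters; a minimal sketch (the helper class name here is hypothetical):

import org.apache.hadoop.mapred.JobConf;

public class PerPhaseSpeculation {
    // Keep reduce tasks single-attempt while still allowing map-side
    // speculation, for jobs where only the reduce phase writes a fixed
    // side-effect file.
    public static void configure(JobConf conf) {
        conf.setMapSpeculativeExecution(true);
        conf.setReduceSpeculativeExecution(false);
    }
}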

From source file:edu.umd.cloud9.pagerank.PartitionGraph.java

License:Apache License

public int run(String[] args) throws IOException {
    if (args.length != 5) {
        printUsage();
        return -1;
    }

    String inPath = args[0];
    String outPath = args[1];
    int numParts = Integer.parseInt(args[2]);
    boolean useRange = Integer.parseInt(args[3]) != 0;
    int nodeCount = Integer.parseInt(args[4]);

    sLogger.info("Tool name: PartitionGraph");
    sLogger.info(" - inputDir: " + inPath);
    sLogger.info(" - outputDir: " + outPath);
    sLogger.info(" - numPartitions: " + numParts);
    sLogger.info(" - useRange?: " + useRange);
    sLogger.info(" - nodeCnt: " + nodeCount);

    JobConf conf = new JobConf(PartitionGraph.class);

    conf.setJobName("Partition Graph " + numParts);
    conf.setNumReduceTasks(numParts);

    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInt("NodeCount", nodeCount);

    FileInputFormat.setInputPaths(conf, new Path(inPath));
    FileOutputFormat.setOutputPath(conf, new Path(outPath));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(PageRankNode.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PageRankNode.class);

    conf.setMapperClass(MapClass.class);
    conf.setReducerClass(ReduceClass.class);

    conf.setSpeculativeExecution(false);

    if (useRange) {
        conf.setPartitionerClass(RangePartitioner.class);
    }

    FileSystem.get(conf).delete(new Path(outPath), true);

    JobClient.runJob(conf);

    return 0;
}

From source file:edu.umd.cloud9.pagerank.RunPageRankBasic.java

License:Apache License

private float phase1(String path, int i, int j, int n, boolean useCombiner, boolean useInmapCombiner,
        boolean useRange) throws IOException {
    JobConf conf = new JobConf(RunPageRankBasic.class);

    String in = path + "/iter" + sFormat.format(i);
    String out = path + "/iter" + sFormat.format(j) + "t";
    String outm = out + "-mass";

    // we need to actually count the number of part files to get the number
    // of partitions (because the directory might contain _log)
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(conf).listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-"))
            numPartitions++;
    }

    sLogger.info("PageRank: iteration " + j + ": Phase1");
    sLogger.info(" - input: " + in);
    sLogger.info(" - output: " + out);
    sLogger.info(" - nodeCnt: " + n);
    sLogger.info(" - useCombiner: " + useCombiner);
    sLogger.info(" - useInmapCombiner: " + useInmapCombiner);
    sLogger.info(" - useRange: " + useRange);
    sLogger.info("computed number of partitions: " + numPartitions);

    int numMapTasks = numPartitions;
    int numReduceTasks = numPartitions;

    conf.setJobName("PageRank:Basic:iteration" + j + ":Phase1");
    conf.setInt("NodeCount", n);

    conf.setNumMapTasks(numMapTasks);
    conf.setNumReduceTasks(numReduceTasks);

    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.set("PageRankMassPath", outm);

    FileInputFormat.setInputPaths(conf, new Path(in));
    FileOutputFormat.setOutputPath(conf, new Path(out));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(PageRankNode.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PageRankNode.class);

    if (useInmapCombiner) {
        conf.setMapperClass(MapWithInMapperCombiningClass.class);
    } else {
        conf.setMapperClass(MapClass.class);
    }

    if (useCombiner) {
        conf.setCombinerClass(CombineClass.class);
    }

    if (useRange) {
        conf.setPartitionerClass(RangePartitioner.class);
    }

    conf.setReducerClass(ReduceClass.class);

    conf.setSpeculativeExecution(false);

    FileSystem.get(conf).delete(new Path(out), true);
    FileSystem.get(conf).delete(new Path(outm), true);

    JobClient.runJob(conf);

    float mass = Float.NEGATIVE_INFINITY;
    FileSystem fs = FileSystem.get(conf);
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        mass = sumLogProbs(mass, fin.readFloat());
        fin.close();
    }

    return mass;
}

From source file:edu.umd.cloud9.pagerank.RunPageRankSchimmy.java

License:Apache License

private float phase1(String path, int i, int j, int n, boolean useCombiner, boolean useInmapCombiner,
        boolean useRange) throws IOException {
    JobConf conf = new JobConf(RunPageRankBasic.class);

    String in = path + "/iter" + sFormat.format(i);
    String out = path + "/iter" + sFormat.format(j) + "t";
    String outm = out + "-mass";

    FileSystem fs = FileSystem.get(conf);

    // we need to actually count the number of part files to get the number
    // of partitions (because the directory might contain _log)
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(conf).listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-"))
            numPartitions++;
    }

    conf.setInt("NodeCount", n);

    Partitioner p = null;

    if (useRange) {
        p = new RangePartitioner<IntWritable, Writable>();
        p.configure(conf);
    } else {
        p = new HashPartitioner<WritableComparable, Writable>();
    }

    // this is really annoying: the mapping between the partition numbers on
    // disk (i.e., part-XXXX) and what partition the file contains (i.e.,
    // key.hash % #reducer) is arbitrary... so this means that we need to
    // open up each partition, peek inside to find out.
    IntWritable key = new IntWritable();
    PageRankNode value = new PageRankNode();
    FileStatus[] status = fs.listStatus(new Path(in));

    StringBuilder sb = new StringBuilder();

    for (FileStatus f : status) {
        if (f.getPath().getName().contains("_logs"))
            continue;

        SequenceFile.Reader reader = new SequenceFile.Reader(fs, f.getPath(), conf);

        reader.next(key, value);
        int np = p.getPartition(key, value, numPartitions);
        reader.close();

        sLogger.info(f.getPath() + "\t" + np);
        sb.append(np + "=" + f.getPath() + "\t");
    }

    sLogger.info(sb.toString().trim());

    sLogger.info("PageRankSchimmy: iteration " + j + ": Phase1");
    sLogger.info(" - input: " + in);
    sLogger.info(" - output: " + out);
    sLogger.info(" - nodeCnt: " + n);
    sLogger.info(" - useCombiner: " + useCombiner);
    sLogger.info(" - useInmapCombiner: " + useInmapCombiner);
    sLogger.info(" - numPartitions: " + numPartitions);
    sLogger.info(" - useRange: " + useRange);
    sLogger.info("computed number of partitions: " + numPartitions);

    int numMapTasks = numPartitions;
    int numReduceTasks = numPartitions;

    conf.setJobName("PageRankSchimmy:iteration" + j + ":Phase1");

    conf.setNumMapTasks(numMapTasks);
    conf.setNumReduceTasks(numReduceTasks);

    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.set("PageRankMassPath", outm);
    conf.set("BasePath", in);
    conf.set("PartitionMapping", sb.toString().trim());

    FileInputFormat.setInputPaths(conf, new Path(in));
    FileOutputFormat.setOutputPath(conf, new Path(out));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(FloatWritable.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PageRankNode.class);

    if (useInmapCombiner) {
        conf.setMapperClass(MapWithInMapperCombiningClass.class);
    } else {
        conf.setMapperClass(MapClass.class);
    }

    if (useCombiner) {
        conf.setCombinerClass(CombineClass.class);
    }

    if (useRange) {
        conf.setPartitionerClass(RangePartitioner.class);
    }

    conf.setReducerClass(ReduceClass.class);

    conf.setSpeculativeExecution(false);

    FileSystem.get(conf).delete(new Path(out), true);
    FileSystem.get(conf).delete(new Path(outm), true);

    JobClient.runJob(conf);

    float mass = Float.NEGATIVE_INFINITY;
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        mass = sumLogProbs(mass, fin.readFloat());
        fin.close();
    }

    return mass;
}

From source file:ivory.preprocess.BuildTermDocVectors.java

License:Apache License

private void writeDoclengthsData(int collectionDocCount) throws IOException {
    JobConf conf = new JobConf(getConf(), GetTermCount.class);

    String indexPath = conf.get("Ivory.IndexPath");
    String collectionName = conf.get("Ivory.CollectionName");
    int docnoOffset = conf.getInt("Ivory.DocnoOffset", 0);

    FileSystem fs = FileSystem.get(conf);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    Path dlFile = env.getDoclengthsData();
    Path inputPath = env.getDoclengthsDirectory();

    sLogger.info("Writing doc length data to " + dlFile + "...");

    conf.setJobName("DocLengthTable:" + collectionName);

    conf.setInt("Ivory.CollectionDocumentCount", collectionDocCount);
    conf.set("InputPath", inputPath.toString());
    conf.set("DocLengthDataFile", dlFile.toString());
    conf.set("mapred.child.java.opts", "-Xmx4096m");

    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(0);
    conf.setSpeculativeExecution(false);

    conf.setInputFormat(NullInputFormat.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(DocLengthDataWriterMapper.class);

    RunningJob job = JobClient.runJob(conf);

    env.writeDocnoOffset(docnoOffset);
    Counters counters = job.getCounters();

    long collectionSumOfDocLengths = (long) counters.findCounter(DocLengths.SumOfDocLengths).getCounter();
    env.writeCollectionAverageDocumentLength((float) collectionSumOfDocLengths / collectionDocCount);
}

From source file:map_reduce.MapReduce_OptimizedBrandesAdditions_DO_JUNG.java

License:Open Source License

@SuppressWarnings("deprecation")
@Override
public int run(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("Usage:\n");
        System.exit(1);
    }

    //       Job job = new Job(super.getConf());

    //      READ IN ALL COMMAND LINE ARGUMENTS
    //      EXAMPLE: 
    // hadoop jar MapReduce_OptimizedBrandesAdditions_DO_JUNG.jar
    // -libjars collections-generic-4.01.jar,jung-graph-impl-2.0.1.jar,jung-api-2.0.1.jar
    // -Dmapred.job.map.memory.mb=4096
    // -Dmapred.job.reduce.memory.mb=4096
    // -Dmapred.child.java.opts=-Xmx3500m
    // -Dmapreduce.task.timeout=60000000
    // -Dmapreduce.job.queuename=QUEUENAME
    // input_iterbrandes_additions_nocomb_10k_1 output_iterbrandes_additions_nocomb_10k_1
    // 10 1 10000 55245 10k 10k_randedges 100 1 false times/ betweenness/

    int m = -1;

    // input path to use on hdfs
    Path inputPath = new Path(args[++m]);

    // output path to use on hdfs
    Path outputPath = new Path(args[++m]);

    // number of Mappers to split the sources: e.g., 1, 10, 100 etc.
    // rule of thumb: the larger the graph (i.e., number of roots to test), the larger should be this number.
    int numOfMaps = Integer.parseInt(args[++m]);

    // number of Reducers to collect the output
    int numOfReduce = Integer.parseInt(args[++m]);

    // Number of vertices in graph
    int N = Integer.parseInt(args[++m]);

    // Number of edges in graph
    int M = Integer.parseInt(args[++m]);

    // Graph file (edge list, tab delimited) (full path)
    String graph = args[++m];

    // File with edges to be added (tab delimited) (full path)
    // Note: this version handles only edges between existing vertices in the graph.
    String random_edges = args[++m];

    // Number of random edges added
    int re = Integer.parseInt(args[++m]);

    // Experiment iteration (in case of multiple experiments)
    int iter = Integer.parseInt(args[++m]);

    // Use combiner or not (true/false)
    Boolean comb = Boolean.valueOf(args[++m]);

    // Output path for file with stats
    String statsoutputpath = args[++m];

    // Output path for file with final betweenness values
    String betoutputpath = args[++m];

    //      BEGIN INITIALIZATION

    JobConf conf = new JobConf(getConf(), MapReduce_OptimizedBrandesAdditions_DO_JUNG.class);
    FileSystem fs = FileSystem.get(conf);

    String setup = "_additions_edges" + re + "_maps" + numOfMaps + "_comb" + comb;
    conf.setJobName("OptimizedBrandesAdditionsDOJung_" + graph + setup + "_" + iter);
    conf.set("HDFS_GRAPH", graph + setup);
    conf.set("HDFS_Random_Edges", random_edges + setup);
    conf.set("output", outputPath.getName());
    conf.set("setup", setup);

    //      CREATE INPUT FILES FOR MAPPERS

    int numOfTasksperMap = (int) Math.ceil(N / numOfMaps);
    //generate an input file for each map task
    for (int i = 0; i < numOfMaps - 1; i++) {
        Path file = new Path(inputPath, "part-r-" + i);
        IntWritable start = new IntWritable(i * numOfTasksperMap);
        IntWritable end = new IntWritable((i * numOfTasksperMap) + numOfTasksperMap - 1);

        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, IntWritable.class,
                IntWritable.class, CompressionType.NONE);
        try {
            writer.append(start, end);
        } finally {
            writer.close();
        }
        System.out.println("Wrote input for Map #" + i + ": " + start + " - " + end);
    }

    // last mapper takes what is left
    Path file = new Path(inputPath, "part-r-" + (numOfMaps - 1));
    IntWritable start = new IntWritable((numOfMaps - 1) * numOfTasksperMap);
    IntWritable end = new IntWritable(N - 1);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, IntWritable.class, IntWritable.class,
            CompressionType.NONE);
    try {
        writer.append(start, end);
    } finally {
        writer.close();
    }
    System.out.println("Wrote input for Map #" + (numOfMaps - 1) + ": " + start + " - " + end);

    //      COPY FILES TO MAPPERS
    System.out.println("Copying graph to cache");
    String LOCAL_GRAPH = graph;
    Path hdfsPath = new Path(graph + setup);

    // upload the file to hdfs. Overwrite any existing copy.
    fs.copyFromLocalFile(false, true, new Path(LOCAL_GRAPH), hdfsPath);
    DistributedCache.addCacheFile(hdfsPath.toUri(), conf);

    System.out.println("Copying random edges to cache");
    String LOCAL_Random_Edges = random_edges;
    hdfsPath = new Path(random_edges + setup);

    // upload the file to hdfs. Overwrite any existing copy.
    fs.copyFromLocalFile(false, true, new Path(LOCAL_Random_Edges), hdfsPath);
    DistributedCache.addCacheFile(hdfsPath.toUri(), conf);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(DoubleWritable.class);

    conf.setMapperClass(IterBrandesMapper.class);
    conf.setNumMapTasks(numOfMaps);

    if (comb)
        conf.setCombinerClass(IterBrandesReducer.class);

    conf.setReducerClass(IterBrandesReducer.class);
    conf.setNumReduceTasks(numOfReduce);

    // turn off speculative execution, because DFS doesn't handle multiple writers to the same file.
    conf.setSpeculativeExecution(false);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    // conf.set("mapred.job.name", "APS-" + outputPath.getName());
    conf.setNumTasksToExecutePerJvm(-1); // JVM reuse

    System.out.println("Starting the execution...! Pray!! \n");
    long time1 = System.nanoTime();
    RunningJob rj = JobClient.runJob(conf);
    long time2 = System.nanoTime();

    //      READ OUTPUT FILES

    System.out.println("\nFinished and now reading/writing Betweenness Output...\n");

    // Assuming 1 reducer.
    Path inFile = new Path(outputPath, "part-00000");
    IntWritable id = new IntWritable();
    DoubleWritable betweenness = new DoubleWritable();
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, inFile, conf);

    FileWriter fw = new FileWriter(new File(betoutputpath + graph + setup + "_betweenness_" + iter));
    try {
        int i = 0;
        for (; i < (N + M + re); i++) {
            reader.next(id, betweenness);
            fw.write(id + "\t" + betweenness + "\n");
            fw.flush();
        }
    } finally {
        reader.close();
        fw.close();
    }

    System.out.println("\nWriting times Output...\n");

    fw = new FileWriter(new File(statsoutputpath + graph + setup + "_times_" + iter));

    fw.write("Total-time:\t" + (time2 - time1) + "\n");
    fw.write("total-map\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("SLOTS_MILLIS_MAPS") + "\n");
    fw.write("total-reduce\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("SLOTS_MILLIS_REDUCES") + "\n");
    fw.write("total-cpu-mr\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("CPU_MILLISECONDS") + "\n");
    fw.write("total-gc-mr\t"
            + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter").getCounter("GC_TIME_MILLIS")
            + "\n");
    fw.write("total-phy-mem-mr\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("PHYSICAL_MEMORY_BYTES") + "\n");
    fw.write("total-vir-mem-mr\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("VIRTUAL_MEMORY_BYTES") + "\n");
    fw.write("brandes\t" + rj.getCounters().getGroup("TimeForBrandes").getCounter("exectime_initial_brandes")
            + "\n");
    fw.write("reduce\t" + rj.getCounters().getGroup("TimeForReduce").getCounter("reduceafteralledges") + "\n");
    fw.flush();

    try {
        Iterator<Counters.Counter> counters = rj.getCounters().getGroup("TimeForRandomEdges").iterator();
        while (counters.hasNext()) {
            Counter cc = counters.next();
            fw.write(cc.getName() + "\t" + cc.getCounter() + "\n");
            fw.flush();
        }
    } finally {
        fw.close();
    }

    return 0;
}

From source file:map_reduce.MapReduce_OptimizedBrandesDeletions_DO_JUNG.java

License:Open Source License

@SuppressWarnings("deprecation")
@Override
public int run(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("Usage:\n");
        System.exit(1);
    }

    //       Job job = new Job(super.getConf());

    //      READ IN ALL COMMAND LINE ARGUMENTS
    //      EXAMPLE: 
    // hadoop jar MapReduce_OptimizedBrandesDeletions_DO_JUNG.jar
    // -libjars collections-generic-4.01.jar,jung-graph-impl-2.0.1.jar,jung-api-2.0.1.jar
    // -Dmapred.job.map.memory.mb=4096
    // -Dmapred.job.reduce.memory.mb=4096
    // -Dmapred.child.java.opts=-Xmx3500m
    // -Dmapreduce.task.timeout=60000000
    // -Dmapreduce.job.queuename=QUEUENAME
    // input_iterbrandes_deletions_nocomb_10k_1 output_iterbrandes_deletions_nocomb_10k_1
    // 10 1 10000 55245 10k 10k_randedges 100 1 false times/ betweenness/

    int m = -1;

    // input path to use on hdfs
    Path inputPath = new Path(args[++m]);

    // output path to use on hdfs
    Path outputPath = new Path(args[++m]);

    // number of Mappers to split the sources: e.g., 1, 10, 100 etc.
    // rule of thumb: the larger the graph (i.e., number of roots to test), the larger should be this number.
    int numOfMaps = Integer.parseInt(args[++m]);

    // number of Reducers to collect the output
    int numOfReduce = Integer.parseInt(args[++m]);

    // Number of vertices in graph
    int N = Integer.parseInt(args[++m]);

    // Number of edges in graph
    int M = Integer.parseInt(args[++m]);

    // Graph file (edge list, tab delimited) (full path)
    String graph = args[++m];

    // File with edges to be removed (tab delimited) (full path)
    // Note: this version handles only edges between existing vertices in the graph.
    String random_edges = args[++m];

    // Number of random edges removed
    int re = Integer.parseInt(args[++m]);

    // Experiment iteration (in case of multiple experiments)
    int iter = Integer.parseInt(args[++m]);

    // Use combiner or not (true/false)
    Boolean comb = Boolean.valueOf(args[++m]);

    // Output path for file with stats
    String statsoutputpath = args[++m];

    // Output path for file with final betweenness values
    String betoutputpath = args[++m];

    //      BEGIN INITIALIZATION

    JobConf conf = new JobConf(getConf(), MapReduce_OptimizedBrandesDeletions_DO_JUNG.class);
    FileSystem fs = FileSystem.get(conf);

    String setup = "_deletions_edges" + re + "_maps" + numOfMaps + "_comb" + comb;
    conf.setJobName("OptimizedBrandesDeletionsDOJung_" + graph + setup + "_" + iter);
    conf.set("HDFS_GRAPH", graph + setup);
    conf.set("HDFS_Random_Edges", random_edges + setup);
    conf.set("output", outputPath.getName());
    conf.set("setup", setup);

    //      CREATE INPUT FILES FOR MAPPERS

    int numOfTasksperMap = (int) Math.ceil(N / numOfMaps);
    //generate an input file for each map task
    for (int i = 0; i < numOfMaps - 1; i++) {
        Path file = new Path(inputPath, "part-r-" + i);
        IntWritable start = new IntWritable(i * numOfTasksperMap);
        IntWritable end = new IntWritable((i * numOfTasksperMap) + numOfTasksperMap - 1);

        SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, IntWritable.class,
                IntWritable.class, CompressionType.NONE);
        try {
            writer.append(start, end);
        } finally {
            writer.close();
        }
        System.out.println("Wrote input for Map #" + i + ": " + start + " - " + end);
    }

    // last mapper takes what is left
    Path file = new Path(inputPath, "part-r-" + (numOfMaps - 1));
    IntWritable start = new IntWritable((numOfMaps - 1) * numOfTasksperMap);
    IntWritable end = new IntWritable(N - 1);
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, file, IntWritable.class, IntWritable.class,
            CompressionType.NONE);
    try {
        writer.append(start, end);
    } finally {
        writer.close();
    }
    System.out.println("Wrote input for Map #" + (numOfMaps - 1) + ": " + start + " - " + end);

    //      COPY FILES TO MAPPERS
    System.out.println("Copying graph to cache");
    String LOCAL_GRAPH = graph;
    Path hdfsPath = new Path(graph + setup);

    // upload the file to hdfs. Overwrite any existing copy.
    fs.copyFromLocalFile(false, true, new Path(LOCAL_GRAPH), hdfsPath);
    DistributedCache.addCacheFile(hdfsPath.toUri(), conf);

    System.out.println("Copying random edges to cache");
    String LOCAL_Random_Edges = random_edges;
    hdfsPath = new Path(random_edges + setup);

    // upload the file to hdfs. Overwrite any existing copy.
    fs.copyFromLocalFile(false, true, new Path(LOCAL_Random_Edges), hdfsPath);
    DistributedCache.addCacheFile(hdfsPath.toUri(), conf);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(DoubleWritable.class);

    conf.setMapperClass(IterBrandesMapper.class);
    conf.setNumMapTasks(numOfMaps);

    if (comb)
        conf.setCombinerClass(IterBrandesReducer.class);

    conf.setReducerClass(IterBrandesReducer.class);
    conf.setNumReduceTasks(numOfReduce);

    // turn off speculative execution, because DFS doesn't handle multiple writers to the same file.
    conf.setSpeculativeExecution(false);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, outputPath);

    // conf.set("mapred.job.name", "APS-" + outputPath.getName());
    conf.setNumTasksToExecutePerJvm(-1); // JVM reuse

    System.out.println("Starting the execution...! Pray!! \n");
    long time1 = System.nanoTime();
    RunningJob rj = JobClient.runJob(conf);
    long time2 = System.nanoTime();

    //      READ OUTPUT FILES

    System.out.println("\nFinished and now reading/writing Betweenness Output...\n");

    // Assuming 1 reducer.
    Path inFile = new Path(outputPath, "part-00000");
    IntWritable id = new IntWritable();
    DoubleWritable betweenness = new DoubleWritable();
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, inFile, conf);

    FileWriter fw = new FileWriter(new File(betoutputpath + graph + setup + "_betweenness_" + iter));
    try {
        int i = 0;
        for (; i < (N + (M - re)); i++) {
            reader.next(id, betweenness);
            fw.write(id + "\t" + betweenness + "\n");
            fw.flush();
        }
    } finally {
        reader.close();
        fw.close();
    }

    System.out.println("\nWriting times Output...\n");

    fw = new FileWriter(new File(statsoutputpath + graph + setup + "_times_" + iter));

    fw.write("Total-time:\t" + (time2 - time1) + "\n");
    fw.write("total-map\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("SLOTS_MILLIS_MAPS") + "\n");
    fw.write("total-reduce\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("SLOTS_MILLIS_REDUCES") + "\n");
    fw.write("total-cpu-mr\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("CPU_MILLISECONDS") + "\n");
    fw.write("total-gc-mr\t"
            + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter").getCounter("GC_TIME_MILLIS")
            + "\n");
    fw.write("total-phy-mem-mr\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("PHYSICAL_MEMORY_BYTES") + "\n");
    fw.write("total-vir-mem-mr\t" + rj.getCounters().getGroup("org.apache.hadoop.mapreduce.TaskCounter")
            .getCounter("VIRTUAL_MEMORY_BYTES") + "\n");
    fw.write("brandes\t" + rj.getCounters().getGroup("TimeForBrandes").getCounter("exectime_initial_brandes")
            + "\n");
    fw.write("reduce\t" + rj.getCounters().getGroup("TimeForReduce").getCounter("reduceafteralledges") + "\n");
    fw.flush();

    try {
        Iterator<Counters.Counter> counters = rj.getCounters().getGroup("TimeForRandomEdges").iterator();
        while (counters.hasNext()) {
            Counter cc = counters.next();
            fw.write(cc.getName() + "\t" + cc.getCounter() + "\n");
            fw.flush();
        }
    } finally {
        fw.close();
    }

    return 0;
}

From source file:net.peacesoft.nutch.crawl.ReFetcher.java

License:Apache License

public void fetch(Path segment, int threads) throws IOException {

    checkConfiguration();

    SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
    long start = System.currentTimeMillis();
    if (LOG.isInfoEnabled()) {
        LOG.info("Fetcher: starting at " + sdf.format(start));
        LOG.info("Fetcher: segment: " + segment);
    }

    // set the actual time for the timelimit relative
    // to the beginning of the whole job and not of a specific task
    // otherwise it keeps trying again if a task fails
    long timelimit = getConf().getLong("ReFetcher.timelimit.mins", -1);
    if (timelimit != -1) {
        timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
        LOG.info("Fetcher Timelimit set for : " + timelimit);
        getConf().setLong("fetcher.timelimit", timelimit);
    }

    // Set the time limit after which the throughput threshold feature is enabled
    timelimit = getConf().getLong("fetcher.throughput.threshold.check.after", 10);
    timelimit = System.currentTimeMillis() + (timelimit * 60 * 1000);
    getConf().setLong("fetcher.throughput.threshold.check.after", timelimit);

    int maxOutlinkDepth = getConf().getInt("fetcher.follow.outlinks.depth", -1);
    if (maxOutlinkDepth > 0) {
        LOG.info("Fetcher: following outlinks up to depth: " + Integer.toString(maxOutlinkDepth));

        int maxOutlinkDepthNumLinks = getConf().getInt("fetcher.follow.outlinks.num.links", 4);
        int outlinksDepthDivisor = getConf().getInt("fetcher.follow.outlinks.depth.divisor", 2);

        int totalOutlinksToFollow = 0;
        for (int i = 0; i < maxOutlinkDepth; i++) {
            totalOutlinksToFollow += (int) Math.floor(outlinksDepthDivisor / (i + 1) * maxOutlinkDepthNumLinks);
        }

        LOG.info("Fetcher: maximum outlinks to follow: " + Integer.toString(totalOutlinksToFollow));
    }

    JobConf job = new NutchJob(getConf());
    job.setJobName("fetch " + segment);

    job.setInt("fetcher.threads.fetch", threads);
    job.set(Nutch.SEGMENT_NAME_KEY, segment.getName());

    // for politeness, don't permit parallel execution of a single task
    job.setSpeculativeExecution(false);

    FileInputFormat.addInputPath(job, new Path(segment, CrawlDatum.GENERATE_DIR_NAME));
    job.setInputFormat(ReFetcher.InputFormat.class);

    job.setMapRunnerClass(ReFetcher.class);

    FileOutputFormat.setOutputPath(job, segment);
    job.setOutputFormat(FetcherOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NutchWritable.class);

    JobClient.runJob(job);

    long end = System.currentTimeMillis();
    LOG.info("Fetcher: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
}

From source file:net.sf.katta.indexing.IndexerJob.java

License:Apache License

public void startIndexer(String path, String finalDestination, int numOfShards) throws IOException {
    // create job conf with class pointing into job jar.
    JobConf jobConf = new JobConf(IndexerJob.class);
    jobConf.setJobName("indexer");
    jobConf.setMapRunnerClass(Indexer.class);
    // alternatively, use a text file and a TextInputFormat
    jobConf.setInputFormat(SequenceFileInputFormat.class);

    Path input = new Path(path);
    FileInputFormat.setInputPaths(jobConf, input);
    // we just set the output path to make hadoop happy.
    FileOutputFormat.setOutputPath(jobConf, new Path(finalDestination));
    // setting the folder where lucene indexes will be copied when finished.
    jobConf.set("finalDestination", finalDestination);
    // important to switch speculative execution off:
    // we don't want anything duplicated.
    jobConf.setSpeculativeExecution(false);

    // The num of map tasks is equal to the num of input splits.
    // The num of input splits by default is equal to the num of HDFS blocks
    // for the input file(s). To get the right num of shards we need to
    // calculate the best input split size.

    FileSystem fs = FileSystem.get(input.toUri(), jobConf);
    FileStatus[] status = fs.globStatus(input);
    long size = 0;
    for (FileStatus fileStatus : status) {
        size += fileStatus.getLen();
    }
    long optimalSplitSize = size / numOfShards;
    jobConf.set("mapred.min.split.size", "" + optimalSplitSize);

    // give more mem to lucene tasks.
    jobConf.set("mapred.child.java.opts", "-Xmx2G");
    jobConf.setNumMapTasks(1);
    jobConf.setNumReduceTasks(0);
    JobClient.runJob(jobConf);
}

From source file:org.apache.ambari.servicemonitor.jobs.FileUsingJobRunner.java

License:Apache License

public int run(String[] args) throws Exception {
    // Configuration processed by ToolRunner
    Configuration conf = getConf();

    CommandLine commandLine = getCommandLine();
    // Create a JobConf using the processed conf
    JobConf jobConf = new JobConf(conf, FileUsingJobRunner.class);

    //tune the config
    if (jobConf.get(JobKeys.RANGEINPUTFORMAT_ROWS) == null) {
        jobConf.setInt(JobKeys.RANGEINPUTFORMAT_ROWS, 1);
    }

    // Process custom command-line options
    String name = OptionHelper.getStringOption(commandLine, "n", "File Using Job");
    if (commandLine.hasOption('x')) {
        //delete the output directory
        String destDir = jobConf.get(JobKeys.MAPRED_OUTPUT_DIR);
        FileSystem fs = FileSystem.get(jobConf);
        fs.delete(new Path(destDir), true);
    }

    // Specify various job-specific parameters     
    jobConf.setMapperClass(FileUsingMapper.class);
    jobConf.setReducerClass(FileUsingReducer.class);
    jobConf.setMapOutputKeyClass(IntWritable.class);
    jobConf.setMapOutputValueClass(IntWritable.class);
    jobConf.setOutputFormat(TextOutputFormat.class);
    jobConf.setInputFormat(RangeInputFormat.class);
    //jobConf.setPartitionerClass(SleepJob.class);
    jobConf.setSpeculativeExecution(false);
    jobConf.setJobName(name);
    jobConf.setJarByClass(this.getClass());
    FileInputFormat.addInputPath(jobConf, new Path("ignored"));

    // Submit the job, then poll for progress until the job is complete
    RunningJob runningJob = JobClient.runJob(jobConf);
    runningJob.waitForCompletion();
    return runningJob.isSuccessful() ? 0 : 1;
}