Example usage for org.apache.hadoop.mapred JobConf setPartitionerClass

Introduction

In this page you can find the example usage for org.apache.hadoop.mapred JobConf setPartitionerClass.

Prototype

public void setPartitionerClass(Class<? extends Partitioner> theClass)

Source Link

Document

Set the Partitioner class used to partition Mapper -outputs to be sent to the Reducer s.

Usage

From source file:edu.umd.cloud9.pagerank.RunPageRankSchimmy.java

License:Apache License

private float phase1(String path, int i, int j, int n, boolean useCombiner, boolean useInmapCombiner,
        boolean useRange) throws IOException {
    JobConf conf = new JobConf(RunPageRankBasic.class);

    String in = path + "/iter" + sFormat.format(i);
    String out = path + "/iter" + sFormat.format(j) + "t";
    String outm = out + "-mass";

    FileSystem fs = FileSystem.get(conf);

    // we need to actually count the number of part files to get the number
    // of partitions (because the directory might contain _log)
    int numPartitions = 0;
    for (FileStatus s : FileSystem.get(conf).listStatus(new Path(in))) {
        if (s.getPath().getName().contains("part-"))
            numPartitions++;//from w w w. j ava 2s. c  om
    }

    conf.setInt("NodeCount", n);

    Partitioner p = null;

    if (useRange) {
        p = new RangePartitioner<IntWritable, Writable>();
        p.configure(conf);
    } else {
        p = new HashPartitioner<WritableComparable, Writable>();
    }

    // this is really annoying: the mapping between the partition numbers on
    // disk (i.e., part-XXXX) and what partition the file contains (i.e.,
    // key.hash % #reducer) is arbitrary... so this means that we need to
    // open up each partition, peek inside to find out.
    IntWritable key = new IntWritable();
    PageRankNode value = new PageRankNode();
    FileStatus[] status = fs.listStatus(new Path(in));

    StringBuilder sb = new StringBuilder();

    for (FileStatus f : status) {
        if (f.getPath().getName().contains("_logs"))
            continue;

        SequenceFile.Reader reader = new SequenceFile.Reader(fs, f.getPath(), conf);

        reader.next(key, value);
        int np = p.getPartition(key, value, numPartitions);
        reader.close();

        sLogger.info(f.getPath() + "\t" + np);
        sb.append(np + "=" + f.getPath() + "\t");
    }

    sLogger.info(sb.toString().trim());

    sLogger.info("PageRankSchimmy: iteration " + j + ": Phase1");
    sLogger.info(" - input: " + in);
    sLogger.info(" - output: " + out);
    sLogger.info(" - nodeCnt: " + n);
    sLogger.info(" - useCombiner: " + useCombiner);
    sLogger.info(" - useInmapCombiner: " + useInmapCombiner);
    sLogger.info(" - numPartitions: " + numPartitions);
    sLogger.info(" - useRange: " + useRange);
    sLogger.info("computed number of partitions: " + numPartitions);

    int numMapTasks = numPartitions;
    int numReduceTasks = numPartitions;

    conf.setJobName("PageRankSchimmy:iteration" + j + ":Phase1");

    conf.setNumMapTasks(numMapTasks);
    conf.setNumReduceTasks(numReduceTasks);

    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.set("PageRankMassPath", outm);
    conf.set("BasePath", in);
    conf.set("PartitionMapping", sb.toString().trim());

    FileInputFormat.setInputPaths(conf, new Path(in));
    FileOutputFormat.setOutputPath(conf, new Path(out));

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(FloatWritable.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PageRankNode.class);

    if (useInmapCombiner) {
        conf.setMapperClass(MapWithInMapperCombiningClass.class);
    } else {
        conf.setMapperClass(MapClass.class);
    }

    if (useCombiner) {
        conf.setCombinerClass(CombineClass.class);
    }

    if (useRange) {
        conf.setPartitionerClass(RangePartitioner.class);
    }

    conf.setReducerClass(ReduceClass.class);

    conf.setSpeculativeExecution(false);

    FileSystem.get(conf).delete(new Path(out), true);
    FileSystem.get(conf).delete(new Path(outm), true);

    JobClient.runJob(conf);

    float mass = Float.NEGATIVE_INFINITY;
    for (FileStatus f : fs.listStatus(new Path(outm))) {
        FSDataInputStream fin = fs.open(f.getPath());
        mass = sumLogProbs(mass, fin.readFloat());
        fin.close();
    }

    return mass;
}

From source file:edu.umd.cloud9.webgraph.CollectHostnames.java

License:Apache License

public int runTool() throws Exception {

    JobConf conf = new JobConf(getConf(), CollectHostnames.class);
    FileSystem fs = FileSystem.get(conf);

    int numMappers = conf.getInt("Cloud9.Mappers", 1);
    int numReducers = conf.getInt("Cloud9.Reducers", 200);

    String inputPath = conf.get("Cloud9.InputPath");
    String outputPath = conf.get("Cloud9.OutputPath");

    conf.setJobName("CollectHostnames");
    conf.set("mapred.child.java.opts", "-Xmx4096m");
    conf.setInt("mapred.task.timeout", 60000000);

    conf.setNumMapTasks(numMappers);//  ww  w.  j  av  a2  s  . c  o m
    conf.setNumReduceTasks(numReducers);

    conf.setMapperClass(Map.class);
    conf.setPartitionerClass(Partition.class);
    conf.setReducerClass(Reduce.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(ArrayListWritable.class);

    conf.setMapOutputKeyClass(PairOfIntString.class);
    conf.setMapOutputValueClass(IntWritable.class);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    SequenceFileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);

    SequenceFileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    sLogger.info("PropagateHostname");
    sLogger.info(" - input path: " + inputPath);
    sLogger.info(" - output path: " + outputPath);

    if (!fs.exists(new Path(outputPath))) {
        JobClient.runJob(conf);
    } else {
        sLogger.info(outputPath + " already exists! Skipping this step...");
    }

    return 0;
}

From source file:edu.umd.cloud9.webgraph.ComputeWeight.java

License:Apache License

public int runTool() throws Exception {

    JobConf conf = new JobConf(getConf(), ComputeWeight.class);
    FileSystem fs = FileSystem.get(conf);

    int numMappers = conf.getInt("Cloud9.Mappers", 1);
    int numReducers = conf.getInt("Cloud9.Reducers", 200);

    String inputPath = conf.get("Cloud9.InputPath");
    String outputPath = conf.get("Cloud9.OutputPath");

    conf.setJobName("ComputeWeights");
    conf.set("mapred.child.java.opts", "-Xmx4096m");
    conf.setInt("mapred.task.timeout", 60000000);

    conf.setNumMapTasks(numMappers);//w w  w . j  av  a  2  s.  c o  m
    conf.setNumReduceTasks(numReducers);

    conf.setMapperClass(Map.class);
    conf.setPartitionerClass(Partition.class);
    conf.setReducerClass(Reduce.class);

    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(ArrayListWritable.class);

    conf.setMapOutputKeyClass(PairOfInts.class);
    conf.setMapOutputValueClass(ArrayListWritable.class);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    SequenceFileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);

    SequenceFileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    LOG.info("ComputeWeight");
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);

    if (!fs.exists(new Path(outputPath))) {
        JobClient.runJob(conf);
    } else {
        LOG.info(outputPath + " already exists! Skipping this step...");
    }

    return 0;
}

From source file:edu.umd.cloud9.webgraph.driver.SortWebGraph.java

License:Apache License

public int run(String[] args) throws Exception {
    if (args.length != 4) {
        printUsage();/*from  ww w  .  j  a  v  a  2 s .  c o m*/
        return -1;
    }

    JobConf conf = new JobConf(getConf(), SortWebGraph.class);
    FileSystem fs = FileSystem.get(conf);

    String inputPath = args[0];
    String outputPath = args[1];
    int numberOfDocuments = Integer.parseInt(args[2]);
    int numMappers = 1;
    int numReducers = Integer.parseInt(args[3]);

    conf.setJobName("SortWebGraph");
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInt("mapred.task.timeout", 60000000);
    conf.set("mapreduce.map.memory.mb", "2048");
    conf.set("mapreduce.map.java.opts", "-Xmx2048m");
    conf.set("mapreduce.reduce.memory.mb", "2048");
    conf.set("mapreduce.reduce.java.opts", "-Xmx2048m");
    conf.set("mapreduce.task.timeout", "60000000");

    if (numberOfDocuments == 0) {
        numberOfDocuments = DEFAULT_NUMBER_OF_DOCUMENTS;
    }
    conf.setInt("Cloud9.NumberOfDocuments", numberOfDocuments);
    conf.setNumMapTasks(numMappers);
    conf.setNumReduceTasks(numReducers);
    conf.setMapperClass(IdentityMapper.class);
    conf.setPartitionerClass(Partition.class);
    conf.setReducerClass(IdentityReducer.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(ArrayListWritable.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(ArrayListWritable.class);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setCompressOutput(conf, true);
    SequenceFileOutputFormat.setOutputCompressionType(conf, SequenceFile.CompressionType.BLOCK);
    SequenceFileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));

    LOG.info("SortAnchorText");
    LOG.info(" - input path: " + inputPath);
    LOG.info(" - output path: " + outputPath);
    LOG.info(" - number of documents: " + conf.getInt("Cloud9.NumberOfDocuments", DEFAULT_NUMBER_OF_DOCUMENTS));
    fs.delete(new Path(outputPath));
    JobClient.runJob(conf);
    return 0;
}

From source file:IndexService.IndexMergeMR.java

License:Open Source License

public static RunningJob run(String inputfiles, String outputdir, Configuration conf) {
    if (inputfiles == null || outputdir == null)
        return null;

    JobConf job = new JobConf(conf);
    job.setJobName("MergeIndexMR");
    job.setJarByClass(IndexMergeMR.class);
    job.setNumReduceTasks(1);//from  w  w w. ja  va  2s. co  m
    FileSystem fs = null;
    try {
        fs = FileSystem.get(job);
        fs.delete(new Path(outputdir), true);

        String[] ifs = inputfiles.split(",");
        TreeSet<String> files = new TreeSet<String>();
        for (int i = 0; i < ifs.length; i++) {
            IFormatDataFile ifdf = new IFormatDataFile(job);
            ifdf.open(ifs[i]);
            Collection<String> strs = ifdf.fileInfo().head().getUdi().infos().values();
            for (String str : strs) {
                files.add(str);
            }
            ifdf.close();
        }
        StringBuffer sb = new StringBuffer();
        for (String str : files) {
            sb.append(str + ",");
        }
        job.set(ConstVar.HD_index_filemap, sb.substring(0, sb.length() - 1));

        IFormatDataFile ifdf = new IFormatDataFile(job);
        ifdf.open(ifs[0]);

        HashMap<Integer, IRecord.IFType> map = ifdf.fileInfo().head().fieldMap().fieldtypes();
        ArrayList<String> fieldStrings = new ArrayList<String>();

        for (int i = 0; i < map.size(); i++) {
            IRecord.IFType type = map.get(i);
            fieldStrings.add(type.type() + ConstVar.RecordSplit + type.idx());
        }

        job.setStrings(ConstVar.HD_fieldMap, fieldStrings.toArray(new String[fieldStrings.size()]));
        job.set("datafiletype", ifdf.fileInfo().head().getUdi().infos().get(123456));
        ifdf.close();
    } catch (Exception e2) {
        e2.printStackTrace();
    }

    FileInputFormat.setInputPaths(job, inputfiles);
    FileOutputFormat.setOutputPath(job, new Path(outputdir));

    job.setOutputKeyClass(IndexKey.class);
    job.setOutputValueClass(IndexValue.class);

    job.setPartitionerClass(IndexMergePartitioner.class);

    job.setMapperClass(MergeIndexMap.class);
    job.setCombinerClass(MergeIndexReduce.class);
    job.setReducerClass(MergeIndexReduce.class);

    job.setInputFormat(IndexMergeIFormatInputFormat.class);
    job.setOutputFormat(IndexMergeIFormatOutputFormat.class);

    try {
        JobClient jc = new JobClient(job);
        return jc.submitJob(job);
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}

From source file:IndexService.IndexMR.java

License:Open Source License

public static RunningJob run(Configuration conf2, String inputfiles, boolean column, String ids,
        String outputdir) {/*from ww  w.j  av  a  2  s . com*/
    if (inputfiles == null || outputdir == null)
        return null;

    JobConf conf = new JobConf(conf2);
    conf.setJobName("IndexMR:\t" + ids);
    conf.setJarByClass(IndexMR.class);
    FileSystem fs = null;
    try {
        fs = FileSystem.get(conf);
        fs.delete(new Path(outputdir), true);
    } catch (IOException e3) {
        e3.printStackTrace();
    }

    conf.set("index.ids", ids);
    if (column) {
        conf.set("datafiletype", "column");
    } else {
        conf.set("datafiletype", "format");
    }

    String[] ifs = inputfiles.split(",");
    long wholerecnum = 0;

    String[] idxs = ids.split(",");
    String[] fieldStrings = new String[idxs.length + 2];

    if (!column) {
        IFormatDataFile ifdf;
        try {
            ifdf = new IFormatDataFile(conf);
            ifdf.open(ifs[0]);
            for (int i = 0; i < idxs.length; i++) {
                int id = Integer.parseInt(idxs[i]);
                byte type = ifdf.fileInfo().head().fieldMap().fieldtypes().get(id).type();
                fieldStrings[i] = type + ConstVar.RecordSplit + i;
            }
            ifdf.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    } else {
        try {
            IColumnDataFile icdf = new IColumnDataFile(conf);
            icdf.open(ifs[0]);
            for (int i = 0; i < idxs.length; i++) {
                int id = Integer.parseInt(idxs[i]);
                byte type = icdf.fieldtypes().get(id).type();
                fieldStrings[i] = type + ConstVar.RecordSplit + i;
            }
            icdf.close();
        } catch (IOException e) {
            e.printStackTrace();
        }

    }

    fieldStrings[fieldStrings.length - 2] = ConstVar.FieldType_Short + ConstVar.RecordSplit
            + (fieldStrings.length - 2);
    fieldStrings[fieldStrings.length - 1] = ConstVar.FieldType_Int + ConstVar.RecordSplit
            + (fieldStrings.length - 1);

    conf.setStrings(ConstVar.HD_fieldMap, fieldStrings);

    if (!column) {
        conf.set(ConstVar.HD_index_filemap, inputfiles);
        for (String file : ifs) {
            IFormatDataFile fff;
            try {
                fff = new IFormatDataFile(conf);
                fff.open(file);
                wholerecnum += fff.segIndex().recnum();
                fff.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    } else {
        HashSet<String> files = new HashSet<String>();
        for (String file : ifs) {
            files.add(file);
        }
        StringBuffer sb = new StringBuffer();
        for (String str : files) {
            sb.append(str).append(",");
        }
        conf.set(ConstVar.HD_index_filemap, sb.substring(0, sb.length() - 1));

        for (String file : files) {
            Path parent = new Path(file).getParent();
            try {
                FileStatus[] fss = fs.listStatus(parent);
                String openfile = "";
                for (FileStatus status : fss) {
                    if (status.getPath().toString().contains(file)) {
                        openfile = status.getPath().toString();
                        break;
                    }
                }
                IFormatDataFile fff = new IFormatDataFile(conf);
                fff.open(openfile);
                wholerecnum += fff.segIndex().recnum();
                fff.close();

            } catch (IOException e) {
                e.printStackTrace();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }

    conf.setNumReduceTasks((int) ((wholerecnum - 1) / (100000000) + 1));

    FileInputFormat.setInputPaths(conf, inputfiles);
    Path outputPath = new Path(outputdir);
    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.setOutputKeyClass(IndexKey.class);
    conf.setOutputValueClass(IndexValue.class);

    conf.setPartitionerClass(IndexPartitioner.class);

    conf.setMapperClass(IndexMap.class);
    conf.setCombinerClass(IndexReduce.class);
    conf.setReducerClass(IndexReduce.class);

    if (column) {
        conf.setInputFormat(IColumnInputFormat.class);
    } else {
        conf.setInputFormat(IFormatInputFormat.class);
    }
    conf.setOutputFormat(IndexIFormatOutputFormat.class);

    try {
        JobClient jc = new JobClient(conf);
        return jc.submitJob(conf);
    } catch (IOException e) {
        e.printStackTrace();
        return null;
    }
}

From source file:it.crs4.pydoop.pipes.Submitter.java

License:Apache License

private static void setupPipesJob(JobConf conf) throws IOException {
    // default map output types to Text
    if (!getIsJavaMapper(conf)) {
        conf.setMapRunnerClass(PipesMapRunner.class);
        // Save the user's partitioner and hook in our's.
        setJavaPartitioner(conf, conf.getPartitionerClass());
        conf.setPartitionerClass(PipesPartitioner.class);
    }/*from   ww w . j a  va2 s .  c o m*/
    if (!getIsJavaReducer(conf)) {
        conf.setReducerClass(PipesReducer.class);
        if (!getIsJavaRecordWriter(conf)) {
            conf.setOutputFormat(NullOutputFormat.class);
        }
    }
    String textClassname = Text.class.getName();
    setIfUnset(conf, MRJobConfig.MAP_OUTPUT_KEY_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.MAP_OUTPUT_VALUE_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.OUTPUT_KEY_CLASS, textClassname);
    setIfUnset(conf, MRJobConfig.OUTPUT_VALUE_CLASS, textClassname);

    // Use PipesNonJavaInputFormat if necessary to handle progress reporting
    // from C++ RecordReaders ...
    if (!getIsJavaRecordReader(conf) && !getIsJavaMapper(conf)) {
        conf.setClass(Submitter.INPUT_FORMAT, conf.getInputFormat().getClass(), InputFormat.class);
        conf.setInputFormat(PipesNonJavaInputFormat.class);
    }

    String exec = getExecutable(conf);
    if (exec == null) {
        throw new IllegalArgumentException("No application program defined.");
    }
    // add default debug script only when executable is expressed as
    // <path>#<executable>
    if (exec.contains("#")) {
        // set default gdb commands for map and reduce task 
        String defScript = "$HADOOP_PREFIX/src/c++/pipes/debug/pipes-default-script";
        setIfUnset(conf, MRJobConfig.MAP_DEBUG_SCRIPT, defScript);
        setIfUnset(conf, MRJobConfig.REDUCE_DEBUG_SCRIPT, defScript);
    }
    URI[] fileCache = DistributedCache.getCacheFiles(conf);
    if (fileCache == null) {
        fileCache = new URI[1];
    } else {
        URI[] tmp = new URI[fileCache.length + 1];
        System.arraycopy(fileCache, 0, tmp, 1, fileCache.length);
        fileCache = tmp;
    }
    try {
        fileCache[0] = new URI(exec);
    } catch (URISyntaxException e) {
        IOException ie = new IOException("Problem parsing execable URI " + exec);
        ie.initCause(e);
        throw ie;
    }
    DistributedCache.setCacheFiles(fileCache, conf);
}

From source file:it.crs4.pydoop.pipes.Submitter.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    CommandLineParser cli = new CommandLineParser();
    if (args.length == 0) {
        cli.printUsage();//from   w  ww. ja v a2s.com
        return 1;
    }
    cli.addOption("input", false, "input path to the maps", "path");
    cli.addOption("output", false, "output path from the reduces", "path");

    cli.addOption("jar", false, "job jar file", "path");
    cli.addOption("inputformat", false, "java classname of InputFormat", "class");
    //cli.addArgument("javareader", false, "is the RecordReader in Java");
    cli.addOption("map", false, "java classname of Mapper", "class");
    cli.addOption("partitioner", false, "java classname of Partitioner", "class");
    cli.addOption("reduce", false, "java classname of Reducer", "class");
    cli.addOption("writer", false, "java classname of OutputFormat", "class");
    cli.addOption("program", false, "URI to application executable", "class");
    cli.addOption("reduces", false, "number of reduces", "num");
    cli.addOption("jobconf", false,
            "\"n1=v1,n2=v2,..\" (Deprecated) Optional. Add or override a JobConf property.", "key=val");
    cli.addOption("lazyOutput", false, "Optional. Create output lazily", "boolean");
    Parser parser = cli.createParser();
    try {

        GenericOptionsParser genericParser = new GenericOptionsParser(getConf(), args);
        CommandLine results = parser.parse(cli.options, genericParser.getRemainingArgs());

        JobConf job = new JobConf(getConf());

        if (results.hasOption("input")) {
            FileInputFormat.setInputPaths(job, results.getOptionValue("input"));
        }
        if (results.hasOption("output")) {
            FileOutputFormat.setOutputPath(job, new Path(results.getOptionValue("output")));
        }
        if (results.hasOption("jar")) {
            job.setJar(results.getOptionValue("jar"));
        }
        if (results.hasOption("inputformat")) {
            setIsJavaRecordReader(job, true);
            job.setInputFormat(getClass(results, "inputformat", job, InputFormat.class));
        }
        if (results.hasOption("javareader")) {
            setIsJavaRecordReader(job, true);
        }
        if (results.hasOption("map")) {
            setIsJavaMapper(job, true);
            job.setMapperClass(getClass(results, "map", job, Mapper.class));
        }
        if (results.hasOption("partitioner")) {
            job.setPartitionerClass(getClass(results, "partitioner", job, Partitioner.class));
        }
        if (results.hasOption("reduce")) {
            setIsJavaReducer(job, true);
            job.setReducerClass(getClass(results, "reduce", job, Reducer.class));
        }
        if (results.hasOption("reduces")) {
            job.setNumReduceTasks(Integer.parseInt(results.getOptionValue("reduces")));
        }
        if (results.hasOption("writer")) {
            setIsJavaRecordWriter(job, true);
            job.setOutputFormat(getClass(results, "writer", job, OutputFormat.class));
        }

        if (results.hasOption("lazyOutput")) {
            if (Boolean.parseBoolean(results.getOptionValue("lazyOutput"))) {
                LazyOutputFormat.setOutputFormatClass(job, job.getOutputFormat().getClass());
            }
        }

        if (results.hasOption("program")) {
            setExecutable(job, results.getOptionValue("program"));
        }
        if (results.hasOption("jobconf")) {
            LOG.warn("-jobconf option is deprecated, please use -D instead.");
            String options = results.getOptionValue("jobconf");
            StringTokenizer tokenizer = new StringTokenizer(options, ",");
            while (tokenizer.hasMoreTokens()) {
                String keyVal = tokenizer.nextToken().trim();
                String[] keyValSplit = keyVal.split("=");
                job.set(keyValSplit[0], keyValSplit[1]);
            }
        }
        // if they gave us a jar file, include it into the class path
        String jarFile = job.getJar();
        if (jarFile != null) {
            final URL[] urls = new URL[] { FileSystem.getLocal(job).pathToFile(new Path(jarFile)).toURL() };
            //FindBugs complains that creating a URLClassLoader should be
            //in a doPrivileged() block. 
            ClassLoader loader = AccessController.doPrivileged(new PrivilegedAction<ClassLoader>() {
                public ClassLoader run() {
                    return new URLClassLoader(urls);
                }
            });
            job.setClassLoader(loader);
        }

        runJob(job);
        return 0;
    } catch (ParseException pe) {
        LOG.info("Error : " + pe);
        cli.printUsage();
        return 1;
    }

}

From source file:ivory.index.BuildIPInvertedIndexDocSorted.java

License:Apache License

@SuppressWarnings("unused")
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildIPInvertedIndexDocSorted.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    String collectionName = env.readCollectionName();

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    int reduceTasks = conf.getInt("Ivory.NumReduceTasks", 0);
    int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0);
    int collectionDocCnt = env.readCollectionDocumentCount();

    LOG.info("PowerTool: BuildIPInvertedIndexDocSorted");
    LOG.info(" - IndexPath: " + indexPath);
    LOG.info(" - CollectionName: " + collectionName);
    LOG.info(" - CollectionDocumentCount: " + collectionDocCnt);
    LOG.info(" - NumMapTasks: " + mapTasks);
    LOG.info(" - NumReduceTasks: " + reduceTasks);
    LOG.info(" - MinSplitSize: " + minSplitSize);

    if (!fs.exists(new Path(indexPath))) {
        fs.mkdirs(new Path(indexPath));
    }//from w w  w .j  a va2s  .  co  m

    Path inputPath = new Path(env.getIntDocVectorsDirectory());
    Path postingsPath = new Path(env.getPostingsDirectory());

    if (fs.exists(postingsPath)) {
        LOG.info("Postings already exist: no indexing will be performed.");
        return 0;
    }

    conf.setJobName("BuildIPInvertedIndex:" + collectionName);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);

    conf.setInt("Ivory.CollectionDocumentCount", collectionDocCnt);

    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, postingsPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);

    conf.setMapOutputKeyClass(PairOfInts.class);
    conf.setMapOutputValueClass(TermPositions.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PostingsListDocSortedPositional.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);
    conf.setPartitionerClass(MyPartitioner.class);

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    env.writePostingsType("ivory.data.PostingsListDocSortedPositional");

    return 0;
}

From source file:mapreduce.BigramCount.java

License:Apache License

/**
 * Runs this tool./*from   w  w  w.j  a  v  a  2s.  co m*/
 */
public int run(String[] args) throws Exception {
    if (args.length != 2) {
        printUsage();
        return -1;
    }

    String inputPath = args[0];
    String outputPath = args[1];

    int mapTasks = 1;//Integer.parseInt(args[2]);
    int reduceTasks = 1;//Integer.parseInt(args[3]);

    sLogger.info("Tool: BigramCount");
    sLogger.info(" - input path: " + inputPath);
    sLogger.info(" - output path: " + outputPath);
    sLogger.info(" - number of mappers: " + mapTasks);
    sLogger.info(" - number of reducers: " + reduceTasks);

    JobConf conf = new JobConf(BigramCount.class);
    conf.setJobName("BigramCount");

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);

    FileInputFormat.setInputPaths(conf, new Path(inputPath));
    FileOutputFormat.setOutputPath(conf, new Path(outputPath));
    FileOutputFormat.setCompressOutput(conf, false);

    /**
     *  Note that these must match the Class arguments given in the mapper 
     */
    conf.setOutputKeyClass(WordPair.class);
    conf.setOutputValueClass(IntWritable.class);

    conf.setMapperClass(MyMapper.class);
    conf.setPartitionerClass(MyPartitioner.class);
    conf.setCombinerClass(MyReducer.class);
    conf.setReducerClass(MyReducer.class);

    // Delete the output directory if it exists already
    Path outputDir = new Path(outputPath);
    FileSystem.get(outputDir.toUri(), conf).delete(outputDir, true);

    long startTime = System.currentTimeMillis();
    JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}