List of usage examples for org.apache.hadoop.mapred JobConf setReducerClass
public void setReducerClass(Class<? extends Reducer> theClass)
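Before the project-specific examples below, here is a minimal, self-contained sketch of the typical call site: an old-API (org.apache.hadoop.mapred) word-count driver that wires a reducer into a JobConf. The WordCountDriver, TokenMapper, and SumReducer classes and the argument paths are hypothetical placeholders, not taken from any example on this page.

import java.io.IOException;
import java.util.Iterator;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;

public class WordCountDriver {

    // Tokenizes each input line into (word, 1) pairs.
    public static class TokenMapper extends MapReduceBase
            implements Mapper<LongWritable, Text, Text, IntWritable> {
        private static final IntWritable ONE = new IntWritable(1);
        private final Text word = new Text();

        public void map(LongWritable key, Text value,
                OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            for (String token : value.toString().split("\\s+")) {
                if (!token.isEmpty()) {
                    word.set(token);
                    output.collect(word, ONE);
                }
            }
        }
    }

    // Sums the counts for each word; this is the class handed to setReducerClass().
    public static class SumReducer extends MapReduceBase
            implements Reducer<Text, IntWritable, Text, IntWritable> {
        public void reduce(Text key, Iterator<IntWritable> values,
                OutputCollector<Text, IntWritable> output, Reporter reporter) throws IOException {
            int sum = 0;
            while (values.hasNext()) {
                sum += values.next().get();
            }
            output.collect(key, new IntWritable(sum));
        }
    }

    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(WordCountDriver.class);
        conf.setJobName("wordcount");

        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(IntWritable.class);

        conf.setMapperClass(TokenMapper.class);
        conf.setReducerClass(SumReducer.class);   // the call this page documents
        conf.setCombinerClass(SumReducer.class);  // a summing reducer is safe to reuse as a combiner

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        JobClient.runJob(conf); // submits the job and blocks until it completes
    }
}

Note that the old org.apache.hadoop.mapred API configures the reducer on JobConf, whereas the newer org.apache.hadoop.mapreduce API sets it on Job; all examples on this page use the old API.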
From source file: it.crs4.pydoop.pipes.Submitter.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    CommandLineParser cli = new CommandLineParser();
    if (args.length == 0) {
        cli.printUsage();
        return 1;
    }
    cli.addOption("input", false, "input path to the maps", "path");
    cli.addOption("output", false, "output path from the reduces", "path");
    cli.addOption("jar", false, "job jar file", "path");
    cli.addOption("inputformat", false, "java classname of InputFormat", "class");
    //cli.addArgument("javareader", false, "is the RecordReader in Java");
    cli.addOption("map", false, "java classname of Mapper", "class");
    cli.addOption("partitioner", false, "java classname of Partitioner", "class");
    cli.addOption("reduce", false, "java classname of Reducer", "class");
    cli.addOption("writer", false, "java classname of OutputFormat", "class");
    cli.addOption("program", false, "URI to application executable", "class");
    cli.addOption("reduces", false, "number of reduces", "num");
    cli.addOption("jobconf", false,
            "\"n1=v1,n2=v2,..\" (Deprecated) Optional. Add or override a JobConf property.", "key=val");
    cli.addOption("lazyOutput", false, "Optional. Create output lazily", "boolean");
    Parser parser = cli.createParser();
    try {
        GenericOptionsParser genericParser = new GenericOptionsParser(getConf(), args);
        CommandLine results = parser.parse(cli.options, genericParser.getRemainingArgs());

        JobConf job = new JobConf(getConf());

        if (results.hasOption("input")) {
            FileInputFormat.setInputPaths(job, results.getOptionValue("input"));
        }
        if (results.hasOption("output")) {
            FileOutputFormat.setOutputPath(job, new Path(results.getOptionValue("output")));
        }
        if (results.hasOption("jar")) {
            job.setJar(results.getOptionValue("jar"));
        }
        if (results.hasOption("inputformat")) {
            setIsJavaRecordReader(job, true);
            job.setInputFormat(getClass(results, "inputformat", job, InputFormat.class));
        }
        if (results.hasOption("javareader")) {
            setIsJavaRecordReader(job, true);
        }
        if (results.hasOption("map")) {
            setIsJavaMapper(job, true);
            job.setMapperClass(getClass(results, "map", job, Mapper.class));
        }
        if (results.hasOption("partitioner")) {
            job.setPartitionerClass(getClass(results, "partitioner", job, Partitioner.class));
        }
        if (results.hasOption("reduce")) {
            setIsJavaReducer(job, true);
            job.setReducerClass(getClass(results, "reduce", job, Reducer.class));
        }
        if (results.hasOption("reduces")) {
            job.setNumReduceTasks(Integer.parseInt(results.getOptionValue("reduces")));
        }
        if (results.hasOption("writer")) {
            setIsJavaRecordWriter(job, true);
            job.setOutputFormat(getClass(results, "writer", job, OutputFormat.class));
        }
        if (results.hasOption("lazyOutput")) {
            if (Boolean.parseBoolean(results.getOptionValue("lazyOutput"))) {
                LazyOutputFormat.setOutputFormatClass(job, job.getOutputFormat().getClass());
            }
        }
        if (results.hasOption("program")) {
            setExecutable(job, results.getOptionValue("program"));
        }
        if (results.hasOption("jobconf")) {
            LOG.warn("-jobconf option is deprecated, please use -D instead.");
            String options = results.getOptionValue("jobconf");
            StringTokenizer tokenizer = new StringTokenizer(options, ",");
            while (tokenizer.hasMoreTokens()) {
                String keyVal = tokenizer.nextToken().trim();
                String[] keyValSplit = keyVal.split("=");
                job.set(keyValSplit[0], keyValSplit[1]);
            }
        }

        // if they gave us a jar file, include it into the class path
        String jarFile = job.getJar();
        if (jarFile != null) {
            final URL[] urls = new URL[] {
                    FileSystem.getLocal(job).pathToFile(new Path(jarFile)).toURL() };
            // FindBugs complains that creating a URLClassLoader should be
            // in a doPrivileged() block.
            ClassLoader loader = AccessController.doPrivileged(new PrivilegedAction<ClassLoader>() {
                public ClassLoader run() {
                    return new URLClassLoader(urls);
                }
            });
            job.setClassLoader(loader);
        }

        runJob(job);
        return 0;
    } catch (ParseException pe) {
        LOG.info("Error : " + pe);
        cli.printUsage();
        return 1;
    }
}
From source file: it.isislab.sof.core.engine.hadoop.sshclient.utils.simulation.executor.SOF.java
License: Apache License
public static void main(String[] args) {
    /*
     * Example arguments (one-shot mode):
     * aids /home/michele/Scrivania/aids netlogo /home/michele/Scrivania/aids/aids.nlogo
     * /home/michele/Scrivania/aids/input.tmp /home/michele/Scrivania/aids/output
     * /home/michele/Scrivania/aids/output.xml false pepp ciao
     */
    if (args.length < 9 || args.length == 11 || args.length == 12 || args.length >= 15) {
        System.out.println("Usage:");
        System.out.println("java -jar SCUD.jar "
                + "<simulation_name> "
                + "<simulation_path_home> "
                + "<simulation_type[mason|netlogo|generic]> "
                + "<simulation_generic_interpreter_path> "
                + "<simulation_program_path> "
                + "<simulation_mapper_input_path> "
                + "<simulation_mapper_output_path> "
                + "<simulation_output_domain_xmlfile> "
                + "<simulation_input_path> "
                + "<<simulation_rating_path>> "
                + "<oneshot[one|loop]> "
                + "<author_name> "
                + "<simulation_description> "
                + "<path_interpreter_evaluate_file> "
                + "<evaluate_file_path>");
        System.exit(-1);
    }

    Configuration conf = null;
    JobConf job = null;
    String AUTHOR = null;                                     // author name
    String SIMULATION_NAME = null;                            // simulation name
    String SIMULATION_HOME = null;                            // simulation path
    String SIM_TYPE = null;                                   // mason, netlogo, generic
    String SIM_EXECUTABLE_SIMULATION_INTERPRETER_PATH = null;
    String SIM_EXECUTABLE_SIMULATION_PROGRAM = null;          // executable program *.jar | *.nlogo
    String SIM_EXECUTION_INPUT_DATA_MAPPER = null;            // input.data path
    String SIM_EXECUTION_OUTPUT_MAPPER = null;                // output loop(i) path
    String SIM_DESCRIPTION_OUTPUT_XML_DOMAIN = null;          // path of domain file
    String SIM_EXECUTION_INPUT_XML = null;                    // execution input path
    boolean ISLOOP = false;                                   // false[one] | true[loop]
    String INTERPRETER_REMOTE_PATH_EVALUATION = null;         // remote bin path for the evaluation program
    String EXECUTABLE_RATING_FILE = null;                     // path of rating file
    String SIM_RATING_PATH = null;

    /*
     * Example arguments (loop mode, 13 args):
     * aids /home/michele/Scrivania/aids netlogo /home/michele/Scrivania/aids/aids.nlogo
     * /home/michele/Scrivania/aids/input.tmp /home/michele/Scrivania/aids/output
     * /home/michele/Scrivania/aids/domain.xml /home/michele/Scrivania/aids/input loop
     * pepp ciao /usr/bin/python /home/michele/Scrivania/aids/evaluate.py
     */
    if (args.length == 13) {
        SIMULATION_NAME = args[0];
        SIMULATION_HOME = args[1];
        SIM_TYPE = args[2];
        SIM_EXECUTABLE_SIMULATION_PROGRAM = args[3];
        SIM_EXECUTION_INPUT_DATA_MAPPER = args[4];
        SIM_EXECUTION_OUTPUT_MAPPER = args[5];
        SIM_DESCRIPTION_OUTPUT_XML_DOMAIN = args[6];
        SIM_EXECUTION_INPUT_XML = args[7];
        SIM_RATING_PATH = args[8];
        ISLOOP = Boolean.parseBoolean(args[9]);
        AUTHOR = args[10];
        INTERPRETER_REMOTE_PATH_EVALUATION = args[11];
        EXECUTABLE_RATING_FILE = args[12];
    } else if (args.length == 9) {
        SIMULATION_NAME = args[0];
        SIMULATION_HOME = args[1];
        SIM_TYPE = args[2];
        SIM_EXECUTABLE_SIMULATION_PROGRAM = args[3];
        SIM_EXECUTION_INPUT_DATA_MAPPER = args[4];
        SIM_EXECUTION_OUTPUT_MAPPER = args[5];
        SIM_DESCRIPTION_OUTPUT_XML_DOMAIN = args[6];
        ISLOOP = Boolean.parseBoolean(args[7]);
        AUTHOR = args[8];
    } else if (args.length == 14) {
        SIMULATION_NAME = args[0];
        SIMULATION_HOME = args[1];
        SIM_TYPE = args[2];
        SIM_EXECUTABLE_SIMULATION_INTERPRETER_PATH = args[3];
        SIM_EXECUTABLE_SIMULATION_PROGRAM = args[4];
        SIM_EXECUTION_INPUT_DATA_MAPPER = args[5];
        SIM_EXECUTION_OUTPUT_MAPPER = args[6];
        SIM_DESCRIPTION_OUTPUT_XML_DOMAIN = args[7];
        SIM_EXECUTION_INPUT_XML = args[8];
        SIM_RATING_PATH = args[9];
        ISLOOP = Boolean.parseBoolean(args[10]);
        AUTHOR = args[11];
        INTERPRETER_REMOTE_PATH_EVALUATION = args[12];
        EXECUTABLE_RATING_FILE = args[13];
    } else if (args.length == 10) {
        SIMULATION_NAME = args[0];
        SIMULATION_HOME = args[1];
        SIM_TYPE = args[2];
        SIM_EXECUTABLE_SIMULATION_INTERPRETER_PATH = args[3];
        SIM_EXECUTABLE_SIMULATION_PROGRAM = args[4];
        SIM_EXECUTION_INPUT_DATA_MAPPER = args[5];
        SIM_EXECUTION_OUTPUT_MAPPER = args[6];
        SIM_DESCRIPTION_OUTPUT_XML_DOMAIN = args[7];
        ISLOOP = Boolean.parseBoolean(args[8]);
        AUTHOR = args[9];
    }

    if (!(SIM_TYPE.equalsIgnoreCase("mason") || SIM_TYPE.equalsIgnoreCase("netlogo")
            || SIM_TYPE.equalsIgnoreCase("generic"))) {
        System.exit(-2);
    }

    conf = new Configuration();
    job = new JobConf(conf, SOF.class);
    job.setJobName(SIMULATION_NAME);
    job.set("simulation.home", SIMULATION_HOME);
    job.set("simulation.name", SIMULATION_NAME);
    job.set("simulation.type", SIM_TYPE);

    if (SIM_TYPE.equalsIgnoreCase("generic")) {
        job.set("simulation.interpreter.genericsim", SIM_EXECUTABLE_SIMULATION_INTERPRETER_PATH);
    }

    job.set("simulation.program.simulation", SIM_EXECUTABLE_SIMULATION_PROGRAM);
    job.set("simulation.executable.input", SIM_EXECUTION_INPUT_DATA_MAPPER);
    job.set("simulation.executable.output", SIM_EXECUTION_OUTPUT_MAPPER);
    job.setBoolean("simulation.executable.mode", ISLOOP);
    job.set("simulation.executable.author", AUTHOR);
    job.set("simulation.description.output.domain", SIM_DESCRIPTION_OUTPUT_XML_DOMAIN);

    /*
     * Local testing only: generate the .tmp input from an XML file.
     * Keep these lines commented out for cluster runs.
     */
    //XmlToText.convertXmlFileToFileText(conf, "/home/lizard87/Desktop/mason_test/input.xml");
    //XmlToText.convertXmlFileToFileText(conf, "/home/lizard87/Desktop/input.xml");
    //XmlToText.convertXmlFileToFileText(conf, "/home/lizard87/Desktop/aids/input.xml");

    if (ISLOOP) {
        job.set("simulation.description.input", SIM_EXECUTION_INPUT_XML);
        job.set("simulation.program.rating", EXECUTABLE_RATING_FILE);
        job.set("simulation.interpreter.rating", INTERPRETER_REMOTE_PATH_EVALUATION);
        job.set("simulation.executable.loop.rating", SIM_RATING_PATH);
    }

    FileInputFormat.addInputPath(job, new Path(SIM_EXECUTION_INPUT_DATA_MAPPER)); // input directory
    FileOutputFormat.setOutputPath(job, new Path(SIM_EXECUTION_OUTPUT_MAPPER));

    if (SIM_TYPE.equalsIgnoreCase("mason")) {
        job.setMapperClass(SOFMapperMason.class);
        job.setReducerClass(SOFReducerMason.class);
    } else if (SIM_TYPE.equalsIgnoreCase("netlogo")) {
        job.setMapperClass(SOFMapperNetLogo.class);
        job.setReducerClass(SOFReducerNetLogo.class);
    } else if (SIM_TYPE.equalsIgnoreCase("generic")) {
        job.setMapperClass(SOFMapperGeneric.class);
        job.setReducerClass(SOFReducerGeneric.class);
    }

    job.setOutputKeyClass(org.apache.hadoop.io.Text.class);
    job.setOutputValueClass(org.apache.hadoop.io.Text.class);

    try {
        JobClient jobc = new JobClient(job);
        System.out.println(jobc + " " + job);
        // JobClient.runJob() blocks until the job completes, so no further
        // polling of the job status is needed.
        RunningJob runjob = JobClient.runJob(job);
        System.exit(0);
    } catch (IOException e) {
        e.printStackTrace();
    }
}
From source file: Iterator.SpeciesIterDriver2.java
@SuppressWarnings("deprecation") public static void main(String[] args) { int iterationCount = 0; while (iterationCount <= 20) { System.out.println("Running Iteration - " + iterationCount); JobClient client = new JobClient(); JobConf conf = new JobConf(SpeciesIterDriver2.class); conf.setJobName("Species Iter - " + iterationCount); // This property is set to generate 5 reducer tasks conf.setNumReduceTasks(5);/* w ww . j av a 2 s. c o m*/ conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(Text.class); //output-iterator-0 contains the input data FileInputFormat.setInputPaths(conf, new Path("output-iterator-" + iterationCount)); iterationCount++; FileOutputFormat.setOutputPath(conf, new Path("output-iterator-" + iterationCount)); conf.setMapperClass(SpeciesIterMapper2.class); conf.setReducerClass(SpeciesIterReducer2.class); conf.setCombinerClass(SpeciesIterReducer2.class); client.setConf(conf); try { JobClient.runJob(conf); } catch (Exception e) { e.printStackTrace(); } } }
From source file: ivory.core.index.MergeGlobalStatsAcrossIndexSegments.java
License: Apache License
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), MergeGlobalStatsAcrossIndexSegments.class);
    FileSystem fs = FileSystem.get(conf);

    String collectionName = conf.get("Ivory.CollectionName");
    String indexPaths = conf.get("Ivory.IndexPaths");
    String dataOutputPath = conf.get("Ivory.DataOutputPath");
    int dfThreshold = conf.getInt("Ivory.DfThreshold", 0);

    // first, compute size of global term space
    Path tmpPaths = new Path("/tmp/index-paths.txt");
    FSDataOutputStream out = fs.create(tmpPaths, true);
    for (String s : indexPaths.split(",")) {
        out.write((s + "\n").getBytes());
    }
    out.close();

    LOG.info("Job: ComputeNumberOfTermsAcrossIndexSegments");
    conf.setJobName("ComputeNumberOfTermsAcrossIndexSegments:" + collectionName);

    FileInputFormat.addInputPath(conf, tmpPaths);

    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(1);
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInputFormat(NLineInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(PairOfIntLong.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    Counters counters = job.getCounters();
    long totalNumTerms = counters
            .findCounter("org.apache.hadoop.mapred.Task$Counter", 6, "REDUCE_INPUT_GROUPS").getCounter();
    LOG.info("total number of terms in global dictionary = " + totalNumTerms);

    // now build the dictionary
    fs.delete(new Path(dataOutputPath), true);
    conf = new JobConf(getConf(), MergeGlobalStatsAcrossIndexSegments.class);

    LOG.info("Job: MergeGlobalStatsAcrossIndexSegments");
    conf.setJobName("MergeGlobalStatsAcrossIndexSegments:" + collectionName);

    FileInputFormat.addInputPath(conf, tmpPaths);

    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(1);
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInputFormat(NLineInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(PairOfIntLong.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);
    conf.setLong("Ivory.IndexNumberOfTerms", totalNumTerms);

    startTime = System.currentTimeMillis();
    job = JobClient.runJob(conf);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // compute number of docs, collection length, avg doc length
    long collectionLength = 0;
    int docCount = 0;
    for (String index : indexPaths.split(",")) {
        LOG.info("reading stats for " + index);
        RetrievalEnvironment env = new RetrievalEnvironment(index, fs);
        long l = env.readCollectionLength();
        int n = env.readCollectionDocumentCount();
        LOG.info(" - CollectionLength: " + l);
        LOG.info(" - CollectionDocumentCount: " + n);
        collectionLength += l;
        docCount += n;
    }

    float avgdl = (float) collectionLength / docCount;

    LOG.info("all index segments: ");
    LOG.info(" - CollectionLength: " + collectionLength);
    LOG.info(" - CollectionDocumentCount: " + docCount);
    LOG.info(" - AverageDocumentLength: " + avgdl);

    RetrievalEnvironment env = new RetrievalEnvironment(dataOutputPath, fs);
    env.writeCollectionAverageDocumentLength(avgdl);
    env.writeCollectionLength(collectionLength);
    env.writeCollectionDocumentCount(docCount);

    return 0;
}
From source file: ivory.index.BuildIntPostingsForwardIndex.java
License: Apache License
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildIntPostingsForwardIndex.class);
    FileSystem fs = FileSystem.get(conf);

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0);
    String indexPath = conf.get("Ivory.IndexPath");

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    String collectionName = env.readCollectionName();

    sLogger.info("Tool: BuildIntPostingsForwardIndex");
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - CollectionName: " + collectionName);

    conf.setJobName("BuildIntPostingsForwardIndex:" + collectionName);

    Path inputPath = new Path(env.getPostingsDirectory());
    FileInputFormat.setInputPaths(conf, inputPath);

    Path postingsIndexPath = new Path(env.getPostingsIndexData());
    if (fs.exists(postingsIndexPath)) {
        sLogger.info("Postings forward index path already exists!");
        return 0;
    }

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);
    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);

    return 0;
}
From source file: ivory.index.BuildIPInvertedIndexDocSorted.java
License: Apache License
@SuppressWarnings("unused") public int runTool() throws Exception { JobConf conf = new JobConf(getConf(), BuildIPInvertedIndexDocSorted.class); FileSystem fs = FileSystem.get(conf); String indexPath = conf.get("Ivory.IndexPath"); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); String collectionName = env.readCollectionName(); int mapTasks = conf.getInt("Ivory.NumMapTasks", 0); int reduceTasks = conf.getInt("Ivory.NumReduceTasks", 0); int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0); int collectionDocCnt = env.readCollectionDocumentCount(); LOG.info("PowerTool: BuildIPInvertedIndexDocSorted"); LOG.info(" - IndexPath: " + indexPath); LOG.info(" - CollectionName: " + collectionName); LOG.info(" - CollectionDocumentCount: " + collectionDocCnt); LOG.info(" - NumMapTasks: " + mapTasks); LOG.info(" - NumReduceTasks: " + reduceTasks); LOG.info(" - MinSplitSize: " + minSplitSize); if (!fs.exists(new Path(indexPath))) { fs.mkdirs(new Path(indexPath)); }//from ww w . ja v a 2 s . co m Path inputPath = new Path(env.getIntDocVectorsDirectory()); Path postingsPath = new Path(env.getPostingsDirectory()); if (fs.exists(postingsPath)) { LOG.info("Postings already exist: no indexing will be performed."); return 0; } conf.setJobName("BuildIPInvertedIndex:" + collectionName); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(reduceTasks); conf.setInt("Ivory.CollectionDocumentCount", collectionDocCnt); conf.setInt("mapred.min.split.size", minSplitSize); conf.set("mapred.child.java.opts", "-Xmx2048m"); FileInputFormat.setInputPaths(conf, inputPath); FileOutputFormat.setOutputPath(conf, postingsPath); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setMapOutputKeyClass(PairOfInts.class); conf.setMapOutputValueClass(TermPositions.class); conf.setOutputKeyClass(IntWritable.class); conf.setOutputValueClass(PostingsListDocSortedPositional.class); conf.setMapperClass(MyMapper.class); conf.setReducerClass(MyReducer.class); conf.setPartitionerClass(MyPartitioner.class); long startTime = System.currentTimeMillis(); RunningJob job = JobClient.runJob(conf); LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); env.writePostingsType("ivory.data.PostingsListDocSortedPositional"); return 0; }
From source file: ivory.preprocess.BuildIntDocVectorsForwardIndex.java
License: Apache License
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildIntDocVectorsForwardIndex.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    String collectionName = env.readCollectionName();
    boolean buildWeighted = conf.getBoolean("Ivory.BuildWeighted", false);

    sLogger.info("Tool: BuildIntDocVectorsIndex");
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - BuildWeighted: " + buildWeighted);
    sLogger.info(" - NumMapTasks: " + mapTasks);

    String intDocVectorsPath;
    String forwardIndexPath;
    if (buildWeighted) {
        intDocVectorsPath = env.getWeightedIntDocVectorsDirectory();
        forwardIndexPath = env.getWeightedIntDocVectorsForwardIndex();
    } else {
        intDocVectorsPath = env.getIntDocVectorsDirectory();
        forwardIndexPath = env.getIntDocVectorsForwardIndex();
    }

    if (!fs.exists(new Path(intDocVectorsPath))) {
        sLogger.info("Error: IntDocVectors don't exist!");
        return 0;
    }

    if (fs.exists(new Path(forwardIndexPath))) {
        sLogger.info("IntDocVectorIndex already exists: skipping!");
        return 0;
    }

    conf.setJobName("BuildIntDocVectorsForwardIndex:" + collectionName);

    Path inputPath = new Path(intDocVectorsPath);
    FileInputFormat.setInputPaths(conf, inputPath);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);

    return 0;
}
From source file: ivory.preprocess.BuildTermDocVectorsForwardIndex.java
License: Apache License
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildTermDocVectorsForwardIndex.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    String collectionName = env.readCollectionName();

    sLogger.info("Tool: BuildTermDocVectorsIndex");
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - NumMapTasks: " + mapTasks);

    if (!fs.exists(new Path(env.getTermDocVectorsDirectory()))) {
        sLogger.info("Error: TermDocVectors don't exist!");
        return 0;
    }

    if (fs.exists(new Path(env.getTermDocVectorsForwardIndex()))) {
        sLogger.info("TermDocVectorIndex already exists: skipping!");
        return 0;
    }

    conf.setJobName("BuildTermDocVectorsForwardIndex:" + collectionName);

    Path inputPath = new Path(env.getTermDocVectorsDirectory());
    FileInputFormat.setInputPaths(conf, inputPath);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);

    return 0;
}
From source file: ivory.preprocess.BuildTermIdMap.java
License: Apache License
@SuppressWarnings("unused") public int runTool() throws Exception { // create a new JobConf, inheriting from the configuration of this // PowerTool/*ww w . j av a 2s. c o m*/ JobConf conf = new JobConf(getConf(), BuildTermIdMap.class); FileSystem fs = FileSystem.get(conf); String indexPath = conf.get("Ivory.IndexPath"); String collectionName = conf.get("Ivory.CollectionName"); int mapTasks = conf.getInt("Ivory.NumMapTasks", 0); int reduceTasks = 1; int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0); sLogger.info("PowerTool: BuildTermIdMap"); sLogger.info(" - CollectionName: " + collectionName); sLogger.info(" - IndexPath: " + indexPath); sLogger.info(" - NumMapTasks: " + mapTasks); sLogger.info(" - NumReduceTasks: " + reduceTasks); RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs); if (!fs.exists(new Path(indexPath))) { sLogger.error("index path doesn't existing: skipping!"); return 0; } Path termsFilePath = new Path(env.getIndexTermsData()); Path termIDsFilePath = new Path(env.getIndexTermIdsData()); Path idToTermFilePath = new Path(env.getIndexTermIdMappingData()); Path dfByTermFilePath = new Path(env.getDfByTermData()); Path cfByTermFilePath = new Path(env.getCfByTermData()); Path dfByIntFilePath = new Path(env.getDfByIntData()); Path cfByIntFilePath = new Path(env.getCfByIntData()); if (fs.exists(termsFilePath) || fs.exists(termIDsFilePath) || fs.exists(idToTermFilePath) || fs.exists(dfByTermFilePath) || fs.exists(cfByTermFilePath) || fs.exists(dfByIntFilePath) || fs.exists(cfByIntFilePath)) { sLogger.info("term and term id data exist: skipping!"); return 0; } Path tmpPath = new Path(env.getTempDirectory()); fs.delete(tmpPath, true); conf.setJobName("BuildTermIdMap:" + collectionName); conf.setNumMapTasks(mapTasks); conf.setNumReduceTasks(reduceTasks); conf.setInt("Ivory.CollectionTermCount", (int) env.readCollectionTermCount()); conf.setInt("mapred.min.split.size", minSplitSize); conf.set("mapred.child.java.opts", "-Xmx2048m"); FileInputFormat.setInputPaths(conf, new Path(env.getTermDfCfDirectory())); FileOutputFormat.setOutputPath(conf, tmpPath); conf.setInputFormat(SequenceFileInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setMapOutputKeyClass(Text.class); conf.setMapOutputValueClass(PairOfIntLong.class); conf.setOutputKeyClass(Text.class); conf.setMapperClass(IdentityMapper.class); conf.setReducerClass(MyReducer.class); long startTime = System.currentTimeMillis(); RunningJob job = JobClient.runJob(conf); sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds"); fs.delete(tmpPath, true); return 0; }
From source file: ivory.preprocess.GetTermCount.java
License: Apache License
public int runTool() throws Exception {
    // create a new JobConf, inheriting from the configuration of this PowerTool
    JobConf conf = new JobConf(getConf(), GetTermCount.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get(Constants.IndexPath);
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    int mapTasks = conf.getInt(Constants.NumMapTasks, 0);
    int reduceTasks = conf.getInt(Constants.NumReduceTasks, 0);

    String collectionName = env.readCollectionName();
    String termDocVectorsPath = env.getTermDocVectorsDirectory();
    String termDfCfPath = env.getTermDfCfDirectory();

    if (!fs.exists(new Path(indexPath))) {
        sLogger.info("index path doesn't exist: skipping!");
        return 0;
    }

    sLogger.info("PowerTool: GetTermCount");
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info(" - NumMapTasks: " + mapTasks);
    sLogger.info(" - NumReduceTasks: " + reduceTasks);
    sLogger.info(" - MinDf: " + conf.getInt(Constants.MinDf, 0));
    sLogger.info(" - MaxDf: " + conf.getInt(Constants.MaxDf, Integer.MAX_VALUE));

    Path outputPath = new Path(termDfCfPath);
    if (fs.exists(outputPath)) {
        sLogger.error("TermDfCf directory exists: skipping!");
        return 0;
    }

    conf.setJobName("GetTermCount:" + collectionName);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    FileInputFormat.setInputPaths(conf, new Path(termDocVectorsPath));
    FileOutputFormat.setOutputPath(conf, outputPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setMapOutputKeyClass(Text.class);
    conf.setMapOutputValueClass(PairOfIntLong.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(PairOfIntLong.class);

    conf.setMapperClass(MyMapper.class);
    conf.setCombinerClass(MyCombiner.class);
    conf.setReducerClass(MyReducer.class);

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    Counters counters = job.getCounters();

    // write out the number of postings
    int collectionTermCount = (int) counters.findCounter(Statistics.Terms).getCounter();
    env.writeCollectionTermCount(collectionTermCount);

    // NOTE: this value is not the same as the number of postings, because
    // postings for non-English terms are discarded, or dropped as a result of df cuts
    long collectionLength = counters.findCounter(Statistics.SumOfDocLengths).getCounter();
    env.writeCollectionLength(collectionLength);

    return 0;
}