Example usage for org.apache.hadoop.mapred JobConf setJobName

List of usage examples for org.apache.hadoop.mapred JobConf setJobName

Introduction

On this page you can find example usages of org.apache.hadoop.mapred.JobConf#setJobName.

Prototype

public void setJobName(String name) 

Source Link

Document

Set the user-specified job name.

Usage

From source file:io.hops.erasure_coding.MapReduceEncoder.java

License:Apache License

/**
 * create new job conf based on configuration passed.
 *
 * @param conf/* ww  w. j ava 2s.com*/
 * @return
 */
private static JobConf createJobConf(Configuration conf) {
    JobConf jobconf = new JobConf(conf, MapReduceEncoder.class);
    jobName = NAME + " " + dateForm.format(new Date(BaseEncodingManager.now()));
    jobconf.setUser(BaseEncodingManager.JOBUSER);
    jobconf.setJobName(jobName);
    jobconf.setMapSpeculativeExecution(false);
    RaidUtils.parseAndSetOptions(jobconf, SCHEDULER_OPTION_LABEL);

    jobconf.setJarByClass(MapReduceEncoder.class);
    jobconf.setInputFormat(DistRaidInputFormat.class);
    jobconf.setOutputKeyClass(Text.class);
    jobconf.setOutputValueClass(Text.class);

    jobconf.setMapperClass(DistRaidMapper.class);
    jobconf.setNumReduceTasks(0);
    return jobconf;
}

From source file:it.isislab.sof.core.engine.hadoop.sshclient.utils.simulation.executor.SOF.java

License:Apache License

public static void main(String[] args) {

    /**/*ww  w . j  a va  2s  .c  o m*/
     * aids /home/michele/Scrivania/aids netlogo /home/michele/Scrivania/aids/aids.nlogo /home/michele/Scrivania/aids/input.tmp /home/michele/Scrivania/aids/output /home/michele/Scrivania/aids/output.xml false pepp ciao  
     *  
     */

    /*         try {//Runtime.getRuntime().exec("rm -r /home/lizard87/Desktop/mason_test/output");
        Runtime.getRuntime().exec("rm -r /home/michele/Scrivania/aids/output");
       } catch (IOException e) {e.printStackTrace();}*/

    if (args.length < 9 || args.length == 11 || args.length == 12 || args.length >= 15) {

        System.out.println("Usage:");
        System.out.println("java -jar SCUD.jar " + "<simulation_name> " + "<simulation_path_home> "
                + "<simulation_type[mason |netlogo |generic]>" + "<simulation_generic_interpreter_path>"
                + "<simultion_program_path> " + "<simulation_mapper_input_path> "
                + "<simulation_mapper_output_path> " + "<simulation_output_domain_xmlfile> "
                + "<simulation_input_path> " + "<<simulation_rating_path>>" + "<oneshot[one|loop]> "
                + "<author_name> " + "<simulation_description> " + "<path_interpreter_evaluate_file> "
                + "<evaluate_file_path>");
        System.exit(-1);
    }

    Configuration conf = null;
    JobConf job = null;

    String AUTHOR = null;/*author name*/
    String SIMULATION_NAME = null;/*simulation name*/
    String SIMULATION_HOME = null;/*path simulation*/
    String SIM_TYPE = null;/*mason, netlogo, generic*/
    String SIM_EXECUTABLE_SIMULATION_INTERPRETER_PATH = null;
    String SIM_EXECUTABLE_SIMULATION_PROGRAM = null; /*executable program *.jar | *.nlogo*/
    String SIM_EXECUTION_INPUT_DATA_MAPPER = null;/*input.data path */
    String SIM_EXECUTION_OUTPUT_MAPPER = null;/*output loop(i) path*/
    String SIM_DESCRIPTION_OUTPUT_XML_DOMAIN = null;/*path of domain file */
    String SIM_EXECUTION_INPUT_XML = null;/*execution input path*/
    boolean ISLOOP = false;/*false[one] | true[loop]*/
    //String DESCRIPTION=null;/*simulations' description*/
    String INTERPRETER_REMOTE_PATH_EVALUATION = null;/*remote program bin path for executing EvalFoo*/
    String EXECUTABLE_RATING_FILE = null;/*path of rating file*/
    String SIM_RATING_PATH = null;

    // aids /home/michele/Scrivania/aids netlogo /home/michele/Scrivania/aids/aids.nlogo /home/michele/Scrivania/aids/input.tmp /home/michele/Scrivania/aids/output /home/michele/Scrivania/aids/domain.xml /home/michele/Scrivania/aids/input loop pepp ciao /usr/bin/python /home/michele/Scrivania/aids/evaluate.py 

    if (args.length == 13) {
        SIMULATION_NAME = args[0];
        SIMULATION_HOME = args[1];
        SIM_TYPE = args[2];
        SIM_EXECUTABLE_SIMULATION_PROGRAM = args[3];
        SIM_EXECUTION_INPUT_DATA_MAPPER = args[4];
        SIM_EXECUTION_OUTPUT_MAPPER = args[5];
        SIM_DESCRIPTION_OUTPUT_XML_DOMAIN = args[6];
        SIM_EXECUTION_INPUT_XML = args[7];
        SIM_RATING_PATH = args[8];
        ISLOOP = Boolean.parseBoolean(args[9]);
        AUTHOR = args[10];
        //DESCRIPTION=args[11];
        INTERPRETER_REMOTE_PATH_EVALUATION = args[11];
        EXECUTABLE_RATING_FILE = args[12];
        //   System.out.println(DESCRIPTION);
        //System.out.println(INTERPRETER_REMOTE_PATH_EVALUATION);

    }

    else if (args.length == 9) {
        SIMULATION_NAME = args[0];
        SIMULATION_HOME = args[1];
        SIM_TYPE = args[2];
        SIM_EXECUTABLE_SIMULATION_PROGRAM = args[3];
        SIM_EXECUTION_INPUT_DATA_MAPPER = args[4];
        SIM_EXECUTION_OUTPUT_MAPPER = args[5];
        SIM_DESCRIPTION_OUTPUT_XML_DOMAIN = args[6];
        ISLOOP = Boolean.parseBoolean(args[7]);
        AUTHOR = args[8];
        //DESCRIPTION=args[9];
    }

    else if (args.length == 14) {
        SIMULATION_NAME = args[0];
        SIMULATION_HOME = args[1];
        SIM_TYPE = args[2];
        SIM_EXECUTABLE_SIMULATION_INTERPRETER_PATH = args[3];
        SIM_EXECUTABLE_SIMULATION_PROGRAM = args[4];
        SIM_EXECUTION_INPUT_DATA_MAPPER = args[5];
        SIM_EXECUTION_OUTPUT_MAPPER = args[6];
        SIM_DESCRIPTION_OUTPUT_XML_DOMAIN = args[7];
        SIM_EXECUTION_INPUT_XML = args[8];
        SIM_RATING_PATH = args[9];
        ISLOOP = Boolean.parseBoolean(args[10]);
        AUTHOR = args[11];
        //   DESCRIPTION=args[12];
        INTERPRETER_REMOTE_PATH_EVALUATION = args[12];
        EXECUTABLE_RATING_FILE = args[13];

    }

    else if (args.length == 10) {
        SIMULATION_NAME = args[0];
        SIMULATION_HOME = args[1];
        SIM_TYPE = args[2];
        SIM_EXECUTABLE_SIMULATION_INTERPRETER_PATH = args[3];
        SIM_EXECUTABLE_SIMULATION_PROGRAM = args[4];
        SIM_EXECUTION_INPUT_DATA_MAPPER = args[5];
        SIM_EXECUTION_OUTPUT_MAPPER = args[6];
        SIM_DESCRIPTION_OUTPUT_XML_DOMAIN = args[7];
        ISLOOP = Boolean.parseBoolean(args[8]);
        AUTHOR = args[9];
        //   DESCRIPTION=args[10];
    }

    if (!(SIM_TYPE.equalsIgnoreCase("mason") || SIM_TYPE.equalsIgnoreCase("netlogo")
            || SIM_TYPE.equalsIgnoreCase("generic"))) {
        System.exit(-2);
    }

    conf = new Configuration();
    job = new JobConf(conf, SOF.class);
    job.setJobName(SIMULATION_NAME/*SIMULATION NAME*/);
    job.set("simulation.home", SIMULATION_HOME);
    job.set("simulation.name", SIMULATION_NAME);
    job.set("simulation.type", SIM_TYPE);

    if (SIM_TYPE.equalsIgnoreCase("generic")) {
        job.set("simulation.interpreter.genericsim", SIM_EXECUTABLE_SIMULATION_INTERPRETER_PATH);
    }

    job.set("simulation.program.simulation", SIM_EXECUTABLE_SIMULATION_PROGRAM);
    job.set("simulation.executable.input", SIM_EXECUTION_INPUT_DATA_MAPPER);
    job.set("simulation.executable.output", SIM_EXECUTION_OUTPUT_MAPPER);
    job.setBoolean("simulation.executable.mode", ISLOOP);
    //job.set("simulation.executable.mode", ISLOOP);
    job.set("simulation.executable.author", AUTHOR);
    //job.set("simulation.executable.description", DESCRIPTION);
    job.set("simulation.description.output.domain", SIM_DESCRIPTION_OUTPUT_XML_DOMAIN);

    /**
     * GENERA IL .TMP
     * COMMENTA LA LINEA 
     * TEST IN LOCALE 
     * SOLO PER IL LOCALE
     */
    //XmlToText.convertXmlFileToFileText(conf,"/home/lizard87/Desktop/mason_test/input.xml");
    //XmlToText.convertXmlFileToFileText(conf,"/home/lizard87/Desktop/input.xml");
    //XmlToText.convertXmlFileToFileText(conf,"/home/lizard87/Desktop/aids/input.xml");

    if (ISLOOP) {
        job.set("simulation.description.input", SIM_EXECUTION_INPUT_XML);
        job.set("simulation.program.rating", EXECUTABLE_RATING_FILE);
        //job.set("simulation.interpreter.selection", INTERPRETER_REMOTE_PATH_SELECTION);
        job.set("simulation.interpreter.rating", INTERPRETER_REMOTE_PATH_EVALUATION);
        job.set("simulation.executable.loop.rating", SIM_RATING_PATH);
    }

    FileInputFormat.addInputPath(job, new Path(SIM_EXECUTION_INPUT_DATA_MAPPER)/*DIRECTORY INPUT*/);
    FileOutputFormat.setOutputPath(job, new Path(SIM_EXECUTION_OUTPUT_MAPPER));

    if (SIM_TYPE.equalsIgnoreCase("mason")) {
        job.setMapperClass(SOFMapperMason.class);
        job.setReducerClass(SOFReducerMason.class);

    } else if (SIM_TYPE.equalsIgnoreCase("netlogo")) {

        job.setMapperClass(SOFMapperNetLogo.class);
        job.setReducerClass(SOFReducerNetLogo.class);
    } else if (SIM_TYPE.equalsIgnoreCase("generic")) {
        job.setMapperClass(SOFMapperGeneric.class);
        job.setReducerClass(SOFReducerGeneric.class);
    }

    job.setOutputKeyClass(org.apache.hadoop.io.Text.class);
    job.setOutputValueClass(org.apache.hadoop.io.Text.class);

    JobClient jobc;

    try {
        jobc = new JobClient(job);
        System.out.println(jobc + " " + job);
        RunningJob runjob;
        runjob = JobClient.runJob(job);
        while (runjob.getJobStatus().equals(JobStatus.SUCCEEDED)) {
        }
        System.exit(0);
    } catch (IOException e) {

        e.printStackTrace();
    }

}

From source file:Iterator.SpeciesIterDriver2.java

/**
 * Runs the species iteration job as a chain of 21 passes (0 through 20).
 *
 * <p>Pass {@code i} reads the directory {@code output-iterator-i} and writes
 * {@code output-iterator-(i+1)}, so {@code output-iterator-0} must already
 * contain the input data. Because every pass consumes the previous pass's
 * output, the chain stops at the first pass that fails.
 *
 * @param args unused
 */
@SuppressWarnings("deprecation")
public static void main(String[] args) {

    final int lastIteration = 20;

    for (int iteration = 0; iteration <= lastIteration; iteration++) {

        System.out.println("Running Iteration - " + iteration);
        JobClient client = new JobClient();
        JobConf conf = new JobConf(SpeciesIterDriver2.class);
        conf.setJobName("Species Iter - " + iteration);

        // This property is set to generate 5 reducer tasks
        conf.setNumReduceTasks(5);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        // output-iterator-0 contains the input data; each pass writes the
        // directory that the next pass reads.
        FileInputFormat.setInputPaths(conf, new Path("output-iterator-" + iteration));
        FileOutputFormat.setOutputPath(conf, new Path("output-iterator-" + (iteration + 1)));

        conf.setMapperClass(SpeciesIterMapper2.class);
        conf.setReducerClass(SpeciesIterReducer2.class);
        conf.setCombinerClass(SpeciesIterReducer2.class);

        client.setConf(conf);
        try {
            JobClient.runJob(conf);
        } catch (Exception e) {
            e.printStackTrace();
            // Later passes depend on this pass's output, so running them
            // after a failure would only produce further errors.
            break;
        }
    }

}

From source file:ivory.core.index.MergeGlobalStatsAcrossIndexSegments.java

License:Apache License

/**
 * Merges global statistics across several index segments.
 *
 * <p>The tool runs in three steps:
 * <ol>
 *   <li>writes the comma-separated {@code Ivory.IndexPaths} to
 *       {@code /tmp/index-paths.txt}, one path per line, and runs a job whose
 *       {@code REDUCE_INPUT_GROUPS} counter gives the size of the global term
 *       space;</li>
 *   <li>deletes {@code Ivory.DataOutputPath} and runs a second job with
 *       {@code Ivory.IndexNumberOfTerms} set to that count;</li>
 *   <li>sums collection length and document count over all segments and writes
 *       them, together with the average document length, to the output
 *       location.</li>
 * </ol>
 *
 * @return always 0
 * @throws Exception propagated from file system access or job execution
 */
public int runTool() throws Exception {

    JobConf conf = new JobConf(getConf(), MergeGlobalStatsAcrossIndexSegments.class);
    FileSystem fs = FileSystem.get(conf);

    String collectionName = conf.get("Ivory.CollectionName");
    String indexPaths = conf.get("Ivory.IndexPaths");
    String dataOutputPath = conf.get("Ivory.DataOutputPath");

    // first, compute size of global term space
    Path tmpPaths = new Path("/tmp/index-paths.txt");

    FSDataOutputStream out = fs.create(tmpPaths, true);
    try {
        for (String s : indexPaths.split(",")) {
            // Encode explicitly so the file does not depend on the platform
            // default charset of the submitting machine.
            out.write((s + "\n").getBytes("UTF-8"));
        }
    } finally {
        // Release the stream even when a write fails.
        out.close();
    }

    LOG.info("Job: ComputeNumberOfTermsAcrossIndexSegments");
    conf.setJobName("ComputeNumberOfTermsAcrossIndexSegments:" + collectionName);

    FileInputFormat.addInputPath(conf, tmpPaths);

    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(1);

    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(NLineInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(PairOfIntLong.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    Counters counters = job.getCounters();

    long totalNumTerms = counters.findCounter("org.apache.hadoop.mapred.Task$Counter", 6, "REDUCE_INPUT_GROUPS")
            .getCounter();

    LOG.info("total number of terms in global dictionary = " + totalNumTerms);

    // now build the dictionary
    fs.delete(new Path(dataOutputPath), true);

    conf = new JobConf(getConf(), MergeGlobalStatsAcrossIndexSegments.class);

    LOG.info("Job: MergeGlobalStatsAcrossIndexSegments");
    conf.setJobName("MergeGlobalStatsAcrossIndexSegments:" + collectionName);

    FileInputFormat.addInputPath(conf, tmpPaths);

    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(1);

    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(NLineInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(PairOfIntLong.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    // NOTE(review): the count is narrowed to int before being stored as a
    // long; kept as-is because the reader of this property is not visible
    // here - confirm whether term counts above Integer.MAX_VALUE can occur.
    conf.setLong("Ivory.IndexNumberOfTerms", (int) totalNumTerms);

    startTime = System.currentTimeMillis();
    job = JobClient.runJob(conf);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // compute some # docs, collection length, avg doc length
    long collectionLength = 0;
    int docCount = 0;
    for (String index : indexPaths.split(",")) {
        LOG.info("reading stats for " + index);

        RetrievalEnvironment env = new RetrievalEnvironment(index, fs);

        long l = env.readCollectionLength();
        int n = env.readCollectionDocumentCount();

        LOG.info(" - CollectionLength: " + l);
        LOG.info(" - CollectionDocumentCount: " + n);

        collectionLength += l;
        docCount += n;
    }

    // Float division: a zero document count yields NaN or infinity, not an exception.
    float avgdl = (float) collectionLength / docCount;

    LOG.info("all index segments: ");
    LOG.info(" - CollectionLength: " + collectionLength);
    LOG.info(" - CollectionDocumentCount: " + docCount);
    LOG.info(" - AverageDocumentLenght: " + avgdl);

    RetrievalEnvironment env = new RetrievalEnvironment(dataOutputPath, fs);

    env.writeCollectionAverageDocumentLength(avgdl);
    env.writeCollectionLength(collectionLength);
    env.writeCollectionDocumentCount(docCount);

    return 0;
}

From source file:ivory.core.preprocess.BuildTargetLangWeightedIntDocVectors.java

License:Apache License

/**
 * Runs the map-only job that reads the index's "TermDocs" data and writes
 * weighted int doc vectors as a sequence file.
 *
 * <p>Settings are read from {@code getConf()}, while the job itself runs on a
 * fresh {@link JobConf}. The file named by {@code Ivory.FinalVocab} is added
 * to the distributed cache.
 *
 * @return -1 when the output directory already exists; otherwise the value of
 *         the {@code Docs.Total} counter narrowed to {@code int}
 * @throws Exception propagated from file system access, URI parsing or job
 *         execution
 */
@SuppressWarnings("deprecation")
public int runTool() throws Exception {
    sLogger.info("PowerTool: GetTargetLangWeightedIntDocVectors");

    JobConf job = new JobConf(BuildTargetLangWeightedIntDocVectors.class);
    FileSystem fileSys = FileSystem.get(job);

    String indexRoot = getConf().get("Ivory.IndexPath");
    RetrievalEnvironment retrievalEnv = new RetrievalEnvironment(indexRoot, fileSys);

    String vectorsDir = retrievalEnv.getWeightedIntDocVectorsDirectory();
    int numMappers = getConf().getInt("Ivory.NumMapTasks", 0);
    int splitSize = getConf().getInt("Ivory.MinSplitSize", 0);
    String collection = getConf().get("Ivory.CollectionName");

    sLogger.info("Characteristics of the collection:");
    sLogger.info(" - CollectionName: " + collection);
    sLogger.info("Characteristics of the job:");
    sLogger.info(" - NumMapTasks: " + numMappers);
    sLogger.info(" - MinSplitSize: " + splitSize);

    // Ship the vocabulary file to the tasks through the distributed cache.
    DistributedCache.addCacheFile(new URI(getConf().get("Ivory.FinalVocab")), job);

    Path termDocsPath = new Path(PwsimEnvironment.getFileNameWithPars(indexRoot, "TermDocs"));
    Path outputDir = new Path(vectorsDir);

    // Never overwrite existing output; signal the condition with -1.
    if (fileSys.exists(outputDir)) {
        sLogger.info("Output path already exists!");
        return -1;
    }

    // Job identity and resources.
    job.setJobName("GetWeightedIntDocVectors:" + collection);
    job.setNumMapTasks(numMappers);
    job.setNumReduceTasks(0);
    job.setInt("mapred.min.split.size", splitSize);
    job.set("mapred.child.java.opts", "-Xmx2048m");
    job.setBoolean("Ivory.Normalize", getConf().getBoolean("Ivory.Normalize", false));

    // Input and output locations and formats.
    FileInputFormat.setInputPaths(job, termDocsPath);
    FileOutputFormat.setOutputPath(job, outputDir);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    // Key/value types are the same for map output and final output.
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(WeightedIntDocVector.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(WeightedIntDocVector.class);

    job.setMapperClass(MyMapper.class);

    long began = System.currentTimeMillis();
    RunningJob finished = JobClient.runJob(job);
    long elapsedMillis = System.currentTimeMillis() - began;
    sLogger.info("Job Finished in " + elapsedMillis / 1000.0 + " seconds");

    long totalDocs = finished.getCounters().findCounter(Docs.Total).getCounter();
    return (int) totalDocs;
}

From source file:ivory.core.preprocess.BuildWeightedIntDocVectors.java

License:Apache License

/**
 * Runs the map-only job that converts int doc vectors into weighted int doc
 * vectors, written as a sequence file.
 *
 * <p>The df-by-int, cf-by-int and doc-length data files of the index must
 * exist; they are added to the distributed cache for the tasks.
 *
 * @return 0 both when the job has run and when the output directory already
 *         exists (in which case nothing is done)
 * @throws RuntimeException if one of the required data files is missing
 * @throws Exception propagated from file system access, URI parsing or job
 *         execution
 */
@SuppressWarnings("deprecation")
public int runTool() throws Exception {
    // NOTE(review): with the level set to WARN the info messages below are
    // presumably suppressed - confirm whether that is intended.
    sLogger.setLevel(Level.WARN);

    sLogger.info("PowerTool: GetWeightedIntDocVectors");

    // create a new JobConf, inheriting from the configuration of this
    // PowerTool
    JobConf conf = new JobConf(getConf(), BuildWeightedIntDocVectors.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    String outputPath = env.getWeightedIntDocVectorsDirectory();
    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0);
    String collectionName = conf.get("Ivory.CollectionName");

    sLogger.info("Characteristics of the collection:");
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info("Characteristics of the job:");
    sLogger.info(" - NumMapTasks: " + mapTasks);
    sLogger.info(" - MinSplitSize: " + minSplitSize);

    String dfByIntFilePath = env.getDfByIntData();
    String cfByIntFilePath = env.getCfByIntData();

    /* add df table to cache */
    if (!fs.exists(new Path(dfByIntFilePath))) {
        throw new RuntimeException("Error, df data file " + dfByIntFilePath + " doesn't exist!");
    }
    DistributedCache.addCacheFile(new URI(dfByIntFilePath), conf);

    /* add cf table to cache */
    if (!fs.exists(new Path(cfByIntFilePath))) {
        throw new RuntimeException("Error, cf data file " + cfByIntFilePath + " doesn't exist!");
    }
    DistributedCache.addCacheFile(new URI(cfByIntFilePath), conf);

    /* add dl table to cache */
    Path docLengthFile = env.getDoclengthsData();
    if (!fs.exists(docLengthFile)) {
        throw new RuntimeException("Error, doc-length data file " + docLengthFile + " doesn't exist!");
    }
    DistributedCache.addCacheFile(docLengthFile.toUri(), conf);

    Path inputPath = new Path(env.getIntDocVectorsDirectory());
    Path weightedVectorsPath = new Path(outputPath);

    // Existing output is left untouched and treated as "already done".
    if (fs.exists(weightedVectorsPath)) {
        sLogger.info("Output path already exists!");
        return 0;
    }

    conf.setJobName("GetWeightedIntDocVectors:" + collectionName);
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);
    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, weightedVectorsPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(WeightedIntDocVector.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(WeightedIntDocVector.class);

    conf.setMapperClass(MyMapper.class);

    long startTime = System.currentTimeMillis();

    JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}

From source file:ivory.core.preprocess.BuildWeightedTermDocVectors.java

License:Apache License

/**
 * Runs the map-only job that converts term doc vectors into weighted term doc
 * vectors, written as a sequence file.
 *
 * <p>Settings are read from {@code getConf()}, while the job itself runs on a
 * fresh {@link JobConf}. The terms, term-id, term-id-mapping, df-by-term and
 * doc-length data files of the index must exist; they are added to the
 * distributed cache for the tasks.
 *
 * @return 0 both when the job has run and when the output directory already
 *         exists (in which case nothing is done)
 * @throws RuntimeException if a required data file is missing or
 *         {@code Ivory.ScoringModel} is not configured
 * @throws Exception propagated from file system access, URI parsing or job
 *         execution
 */
@SuppressWarnings("deprecation")
public int runTool() throws Exception {
    sLogger.info("PowerTool: GetWeightedTermDocVectors");

    JobConf conf = new JobConf(BuildWeightedTermDocVectors.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = getConf().get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    String outputPath = env.getWeightedTermDocVectorsDirectory();
    int mapTasks = getConf().getInt("Ivory.NumMapTasks", 0);
    int minSplitSize = getConf().getInt("Ivory.MinSplitSize", 0);
    String collectionName = getConf().get("Ivory.CollectionName");

    String termsFilePath = env.getIndexTermsData();
    String termsIdsFilePath = env.getIndexTermIdsData();
    String termIdMappingFilePath = env.getIndexTermIdMappingData();
    String dfByTermFilePath = env.getDfByTermData();

    Path inputPath = new Path(env.getTermDocVectorsDirectory());
    Path weightedVectorsPath = new Path(outputPath);

    // Existing output is left untouched and treated as "already done".
    if (fs.exists(weightedVectorsPath)) {
        sLogger.info("Output path already exists!");
        return 0;
    }

    /* add terms file to cache */
    if (!fs.exists(new Path(termsFilePath)) || !fs.exists(new Path(termsIdsFilePath))
            || !fs.exists(new Path(termIdMappingFilePath))) {
        throw new RuntimeException("Error, terms file " + termsFilePath + "/" + termsIdsFilePath + "/"
                + termIdMappingFilePath + " doesn't exist!");
    }
    DistributedCache.addCacheFile(new URI(termsFilePath), conf);
    DistributedCache.addCacheFile(new URI(termsIdsFilePath), conf);
    DistributedCache.addCacheFile(new URI(termIdMappingFilePath), conf);

    /* add df table to cache */
    if (!fs.exists(new Path(dfByTermFilePath))) {
        throw new RuntimeException("Error, df data file " + dfByTermFilePath + " doesn't exist!");
    }
    DistributedCache.addCacheFile(new URI(dfByTermFilePath), conf);

    /* add dl table to cache */
    Path docLengthFile = env.getDoclengthsData();
    if (!fs.exists(docLengthFile)) {
        throw new RuntimeException("Error, doc-length data file " + docLengthFile + " doesn't exist!");
    }
    DistributedCache.addCacheFile(docLengthFile.toUri(), conf);

    conf.setMapperClass(MyMapper.class);
    conf.setJobName("GetWeightedTermDocVectors:" + collectionName);
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);
    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInt("Ivory.MinNumTerms", getConf().getInt("Ivory.MinNumTerms", Integer.MAX_VALUE));
    conf.setBoolean("Ivory.Normalize", getConf().getBoolean("Ivory.Normalize", false));

    // Optional setting: only forwarded when present.
    if (getConf().get("Ivory.ShortDocLengths") != null) {
        conf.set("Ivory.ShortDocLengths", getConf().get("Ivory.ShortDocLengths"));
    }

    // Required setting: fail with a clear message instead of handing a null
    // value to Configuration.set.
    String scoringModel = getConf().get("Ivory.ScoringModel");
    if (scoringModel == null) {
        throw new RuntimeException("Error, Ivory.ScoringModel is not set!");
    }
    conf.set("Ivory.ScoringModel", scoringModel);

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, weightedVectorsPath);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(HMapSFW.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(HMapSFW.class);

    sLogger.info("Running job: " + conf.getJobName());

    long startTime = System.currentTimeMillis();
    JobClient.runJob(conf);
    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    return 0;
}

From source file:ivory.index.BuildIntPostingsForwardIndex.java

License:Apache License

/**
 * Runs the job that builds the forward index over the int postings of an
 * index, using a single reducer and no file output.
 *
 * @return 0 both when the job has run and when the postings forward index
 *         already exists (in which case no job is started)
 * @throws Exception propagated from file system access or job execution
 */
public int runTool() throws Exception {
    JobConf job = new JobConf(getConf(), BuildIntPostingsForwardIndex.class);
    FileSystem fileSys = FileSystem.get(job);

    int numMappers = job.getInt("Ivory.NumMapTasks", 0);
    int splitSize = job.getInt("Ivory.MinSplitSize", 0);
    String indexRoot = job.get("Ivory.IndexPath");

    RetrievalEnvironment retrievalEnv = new RetrievalEnvironment(indexRoot, fileSys);
    String collection = retrievalEnv.readCollectionName();

    sLogger.info("Tool: BuildIntPostingsForwardIndex");
    sLogger.info(" - IndexPath: " + indexRoot);
    sLogger.info(" - CollectionName: " + collection);

    job.setJobName("BuildIntPostingsForwardIndex:" + collection);

    // The postings directory of the index is the job input.
    FileInputFormat.setInputPaths(job, new Path(retrievalEnv.getPostingsDirectory()));

    // Skip the job entirely when the forward index is already there.
    Path forwardIndex = new Path(retrievalEnv.getPostingsIndexData());
    if (fileSys.exists(forwardIndex)) {
        sLogger.info("Postings forward index path already exists!");
        return 0;
    }

    // Task counts and resources.
    job.setNumMapTasks(numMappers);
    job.setNumReduceTasks(1);
    job.setInt("mapred.min.split.size", splitSize);
    job.set("mapred.child.java.opts", "-Xmx2048m");

    // Formats: sequence-file input, no file output.
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(NullOutputFormat.class);

    // Key/value types.
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(Text.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    job.setMapRunnerClass(MyMapRunner.class);
    job.setReducerClass(MyReducer.class);

    JobClient.runJob(job);

    return 0;
}

From source file:ivory.index.BuildIPInvertedIndexDocSorted.java

License:Apache License

/**
 * Runs the job that builds a document-sorted positional inverted index from
 * the int doc vectors of an index, then records the postings type.
 *
 * @return 0 both when the job has run and when postings already exist (in
 *         which case no indexing is performed)
 * @throws Exception propagated from file system access or job execution
 */
@SuppressWarnings("unused")
public int runTool() throws Exception {
    JobConf job = new JobConf(getConf(), BuildIPInvertedIndexDocSorted.class);
    FileSystem fileSys = FileSystem.get(job);

    String indexRoot = job.get("Ivory.IndexPath");
    RetrievalEnvironment retrievalEnv = new RetrievalEnvironment(indexRoot, fileSys);

    String collection = retrievalEnv.readCollectionName();

    int numMappers = job.getInt("Ivory.NumMapTasks", 0);
    int numReducers = job.getInt("Ivory.NumReduceTasks", 0);
    int splitSize = job.getInt("Ivory.MinSplitSize", 0);
    int numDocs = retrievalEnv.readCollectionDocumentCount();

    LOG.info("PowerTool: BuildIPInvertedIndexDocSorted");
    LOG.info(" - IndexPath: " + indexRoot);
    LOG.info(" - CollectionName: " + collection);
    LOG.info(" - CollectionDocumentCount: " + numDocs);
    LOG.info(" - NumMapTasks: " + numMappers);
    LOG.info(" - NumReduceTasks: " + numReducers);
    LOG.info(" - MinSplitSize: " + splitSize);

    // Make sure the index directory itself is present.
    Path indexDir = new Path(indexRoot);
    if (!fileSys.exists(indexDir)) {
        fileSys.mkdirs(new Path(indexRoot));
    }

    Path docVectorsDir = new Path(retrievalEnv.getIntDocVectorsDirectory());
    Path postingsDir = new Path(retrievalEnv.getPostingsDirectory());

    // Existing postings are never rebuilt.
    if (fileSys.exists(postingsDir)) {
        LOG.info("Postings already exist: no indexing will be performed.");
        return 0;
    }

    // Job identity, task counts and resources.
    job.setJobName("BuildIPInvertedIndex:" + collection);
    job.setNumMapTasks(numMappers);
    job.setNumReduceTasks(numReducers);
    job.setInt("Ivory.CollectionDocumentCount", numDocs);
    job.setInt("mapred.min.split.size", splitSize);
    job.set("mapred.child.java.opts", "-Xmx2048m");

    // Input and output locations and formats.
    FileInputFormat.setInputPaths(job, docVectorsDir);
    FileOutputFormat.setOutputPath(job, postingsDir);
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);

    // Key/value types for the map output and for the final output.
    job.setMapOutputKeyClass(PairOfInts.class);
    job.setMapOutputValueClass(TermPositions.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(PostingsListDocSortedPositional.class);

    job.setMapperClass(MyMapper.class);
    job.setReducerClass(MyReducer.class);
    job.setPartitionerClass(MyPartitioner.class);

    long began = System.currentTimeMillis();
    JobClient.runJob(job);
    long elapsedMillis = System.currentTimeMillis() - began;
    LOG.info("Job Finished in " + elapsedMillis / 1000.0 + " seconds");

    // Record which postings implementation the index now contains.
    retrievalEnv.writePostingsType("ivory.data.PostingsListDocSortedPositional");

    return 0;
}

From source file:ivory.preprocess.BuildIntDocVectors.java

License:Apache License

/**
 * Runs the map-only job that converts term doc vectors into int doc vectors,
 * written as a record-compressed sequence file.
 *
 * <p>The terms, term-id and id-to-term files of the index are added to the
 * distributed cache; only the first two are checked for existence.
 *
 * @return always 0: after the job has run, when the terms files are missing
 *         (logged as an error), and when the output already exists
 * @throws Exception propagated from file system access, URI parsing or job
 *         execution
 */
@SuppressWarnings("unused")
public int runTool() throws Exception {
    // The job configuration inherits from the configuration of this PowerTool.
    JobConf job = new JobConf(getConf(), BuildIntDocVectors.class);
    FileSystem fileSys = FileSystem.get(job);

    String indexRoot = job.get("Ivory.IndexPath");
    RetrievalEnvironment retrievalEnv = new RetrievalEnvironment(indexRoot, fileSys);

    int numMappers = job.getInt("Ivory.NumMapTasks", 0);

    String collection = retrievalEnv.readCollectionName();

    sLogger.info("PowerTool: BuildIntDocVectors");
    sLogger.info(" - IndexPath: " + indexRoot);
    sLogger.info(" - CollectionName: " + collection);
    sLogger.info(" - NumMapTasks: " + numMappers);
    sLogger.info("This is new!");

    String termsLocation = retrievalEnv.getIndexTermsData();
    String termIdsLocation = retrievalEnv.getIndexTermIdsData();
    String idToTermLocation = retrievalEnv.getIndexTermIdMappingData();

    Path termsPath = new Path(termsLocation);
    Path termIdsPath = new Path(termIdsLocation);

    // Both dictionary files must be present (De Morgan form of the check).
    boolean dictionaryPresent = fileSys.exists(termsPath) && fileSys.exists(termIdsPath);
    if (!dictionaryPresent) {
        sLogger.error("Error, terms files don't exist!");
        return 0;
    }

    // Existing output is left untouched.
    Path intDocVectorsDir = new Path(retrievalEnv.getIntDocVectorsDirectory());
    if (fileSys.exists(intDocVectorsDir)) {
        sLogger.info("IntDocVectors already exist: skipping!");
        return 0;
    }

    // Ship the dictionary files to the tasks through the distributed cache.
    DistributedCache.addCacheFile(new URI(termsLocation), job);
    DistributedCache.addCacheFile(new URI(termIdsLocation), job);
    DistributedCache.addCacheFile(new URI(idToTermLocation), job);

    job.setJobName("BuildIntDocVectors:" + collection);

    job.setNumMapTasks(numMappers);
    job.setNumReduceTasks(0);

    FileInputFormat.setInputPaths(job, retrievalEnv.getTermDocVectorsDirectory());
    FileOutputFormat.setOutputPath(job, intDocVectorsDir);

    job.set("mapred.child.java.opts", "-Xmx2048m");

    // Sequence files in and out, with record-level output compression.
    job.setInputFormat(SequenceFileInputFormat.class);
    job.setOutputFormat(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.RECORD);

    // Key/value types are the same for map output and final output.
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(LazyIntDocVector.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(LazyIntDocVector.class);

    job.setMapperClass(MyMapper.class);

    long began = System.currentTimeMillis();
    JobClient.runJob(job);
    long elapsedMillis = System.currentTimeMillis() - began;
    sLogger.info("Job Finished in " + elapsedMillis / 1000.0 + " seconds");

    return 0;
}