List of usage examples for org.apache.hadoop.mapreduce.lib.input MultipleInputs addInputPath
@SuppressWarnings("unchecked") public static void addInputPath(Job job, Path path, Class<? extends InputFormat> inputFormatClass)
From source file:com.conversantmedia.mapreduce.tool.annotation.handler.MultiInputAnnotationHandler.java
License:Apache License
@Override
public void process(Annotation annotation, Job job, Object target) throws ToolException {
    for (Input input : ((MultiInput) annotation).value()) {
        Path path = getInputAsPath(input.path());
        if (input.mapper() == Mapper.class) {
            MultipleInputs.addInputPath(job, path, input.format());
        } else {
            MultipleInputs.addInputPath(job, path, input.format(), input.mapper());
            // Need to call again here so the call is captured by our aspect which
            // will replace it with the annotated delegating mapper class for resource
            // injection if required.
            job.setMapperClass(DelegatingMapper.class);
        }
    }
}
From source file:com.twitter.algebra.matrix.multiply.AtB_DMJ.java
License:Apache License
/**
 * Perform A x B, where At and B refer to the paths that contain matrices in
 * {@link SequenceFileInputFormat}. One of At and B must also conform with
 * {@link MapDir} format. Refer to {@link AtB_DMJ} for further details.
 *
 * @param conf the initial configuration
 * @param mapDirPath path to the matrix in {@link MapDir} format
 * @param matrixInputPaths the list of paths to matrix input partitions over
 *          which we iterate
 * @param matrixOutputPath path to which AxB will be written
 * @param atCols number of columns of At (rows of A)
 * @param bCols number of columns of B
 * @param colsPerPartition cols per partition of the input matrix (whether At or B)
 * @param aIsMapDir is A chosen to be loaded as MapDir
 * @param useInMemCombiner whether to use the in-memory combiner
 * @return the running job
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public Job run(Configuration conf, Path mapDirPath, Path matrixInputPaths, Path matrixOutputPath,
        int atCols, int bCols, int colsPerPartition, boolean aIsMapDir, boolean useInMemCombiner)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);
    conf.set(MATRIXINMEMORY, mapDirPath.toString());
    conf.setBoolean(AISMAPDIR, aIsMapDir);
    conf.setBoolean(USEINMEMCOMBINER, useInMemCombiner);
    conf.setInt(RESULTROWS, atCols);
    conf.setInt(RESULTCOLS, bCols);
    conf.setInt(PARTITIONCOLS, colsPerPartition);
    FileSystem fs = FileSystem.get(matrixOutputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPaths, "dmj");

    if (useInMemCombiner) {
        Configuration newConf = new Configuration(conf);
        newConf.set("mapreduce.task.io.sort.mb", "1");
        conf = newConf;
    }

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(AtB_DMJ.class);
    job.setJobName(AtB_DMJ.class.getSimpleName());
    matrixOutputPath = fs.makeQualified(matrixOutputPath);
    matrixInputPaths = fs.makeQualified(matrixInputPaths);

    MultipleInputs.addInputPath(job, matrixInputPaths, SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);
    if (!useInMemCombiner)
        job.setCombinerClass(AtBOuterStaticMapsideJoinJob.MyReducer.class);

    int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "dmj");
    job.setNumReduceTasks(numReducers);
    // ensures total order (when used with {@link MatrixOutputFormat})
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, atCols);

    job.setReducerClass(EpsilonReducer.class);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    return job;
}
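A hypothetical invocation of the method above, assuming a no-arg constructor; all paths and matrix dimensions are illustrative only:

Configuration conf = new Configuration();
AtB_DMJ dmj = new AtB_DMJ();
// Multiply At (stored in MapDir format) by the B partitions, no in-memory combiner.
Job job = dmj.run(conf,
        new Path("/matrices/At-mapdir"),  // mapDirPath (illustrative)
        new Path("/matrices/B-parts"),    // matrixInputPaths (illustrative)
        new Path("/matrices/AxB"),        // matrixOutputPath (illustrative)
        1000,                             // atCols
        500,                              // bCols
        100,                              // colsPerPartition
        true,                             // aIsMapDir: A is the MapDir-formatted operand
        false);                           // useInMemCombiner
job.waitForCompletion(true);              // run() only submits; wait for the result here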
From source file:com.twitter.algebra.matrix.text.Matrix2TextJob.java
License:Apache License
public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath)
        throws IOException, InterruptedException, ClassNotFoundException {
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(Matrix2TextJob.class);
    job.setJobName(Matrix2TextJob.class.getSimpleName());

    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    // FileInputFormat.addInputPath(job, matrixInputPath);
    MultipleInputs.addInputPath(job, matrixInputPath, SequenceFileInputFormat.class);
    // job.setInputFormatClass(SequenceFileInputFormat.class);
    TextOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(TextOutputFormat.class);
    // job.setOutputKeyClass(IntWritable.class);
    // job.setOutputValueClass(org.apache.hadoop.io.Text);
    job.setMapperClass(IdMapper.class);

    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}
From source file:com.twitter.algebra.nmf.ColPartitionJob.java
License:Apache License
/**
 * Partition A on columns, where A refers to the path that contains a matrix in
 * {@link SequenceFileInputFormat}. Refer to {@link ColPartitionJob} for
 * further details.
 *
 * @param conf the initial configuration
 * @param matrixInputPath the path to the input matrix A
 * @param matrixOutputPath the path of the resulting partitioned matrix
 * @param numInputRows rows
 * @param numInputCols cols
 * @param numColPartitions the hint for the desired number of column partitions
 * @return the running job
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public Job run(Configuration conf, Path matrixInputPath, Path matrixOutputPath,
        int numInputRows, int numInputCols, int numColPartitions)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);
    FileSystem fs = FileSystem.get(matrixOutputPath.toUri(), conf);

    int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "colpartition");
    int colPartSize = getColPartitionSize(numInputCols, numColPartitions);
    numColPartitions = (int) Math.ceil(numInputCols / (double) colPartSize);
    if (numReducers < numColPartitions)
        numReducers = numColPartitions;

    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "colpartition");

    conf.setInt(NUM_ORIG_ROWS_KEY, numInputRows);
    conf.setInt(NUM_ORIG_COLS_KEY, numInputCols);
    conf.setInt(NUM_COL_PARTITIONS, numColPartitions);

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(ColPartitionJob.class);
    job.setJobName(ColPartitionJob.class.getSimpleName());
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    MultipleInputs.addInputPath(job, matrixInputPath, SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(ElementWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    RowColPartitioner.setPartitioner(job, RowColPartitioner.ElementRowColPartitioner.class,
            numInputRows, numInputCols, numColPartitions);

    job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(numReducers);

    // job.setOutputFormatClass(SequenceFileOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
    return job;
}
From source file:com.twitter.algebra.nmf.CompositeDMJ.java
License:Apache License
public Job run(Configuration conf, Path mapDirPath, Path matrixInputPaths, Path matrixOutputPath,
        int atCols, boolean aIsMapDir, String inMemCStr, int inMemCRows, int inMemCCols,
        float alpha1, float alpha2)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);
    conf.set(MATRIXINMEMORY, inMemCStr);
    conf.setInt(MATRIXINMEMORYROWS, inMemCRows);
    conf.setInt(MATRIXINMEMORYCOLS, inMemCCols);
    conf.setFloat(ALPHA1, alpha1);
    conf.setFloat(ALPHA2, alpha2);
    FileSystem fs = FileSystem.get(matrixOutputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPaths, "compositedmj");
    conf.set(MAPDIRMATRIX, mapDirPath.toString());
    conf.setBoolean(AISMAPDIR, aIsMapDir);

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(CompositeDMJ.class);
    job.setJobName(CompositeDMJ.class.getSimpleName() + "-" + matrixOutputPath.getName());
    matrixOutputPath = fs.makeQualified(matrixOutputPath);
    matrixInputPaths = fs.makeQualified(matrixInputPaths);

    MultipleInputs.addInputPath(job, matrixInputPaths, SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    // ensures total order (when used with {@link MatrixOutputFormat})
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, atCols);

    job.setNumReduceTasks(0);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    return job;
}
From source file:com.twitter.algebra.nmf.ErrDMJ.java
License:Apache License
public Job run(Configuration conf, Path xPath, Path matrixAInputPath, Path ytPath, Path outPath,
        int aRows, int ytRows, int ytCols)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);
    conf.set(MAPDIRMATRIXX, xPath.toString());
    conf.set(MAPDIRMATRIXYT, ytPath.toString());
    conf.setInt(YTROWS, ytRows);
    conf.setInt(YTCOLS, ytCols);
    FileSystem fs = FileSystem.get(outPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixAInputPath, "err");

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(ErrDMJ.class);
    job.setJobName(ErrDMJ.class.getSimpleName() + "-" + outPath.getName());
    matrixAInputPath = fs.makeQualified(matrixAInputPath);
    MultipleInputs.addInputPath(job, matrixAInputPath, SequenceFileInputFormat.class);
    outPath = fs.makeQualified(outPath);
    FileOutputFormat.setOutputPath(job, outPath);

    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    int numReducers = 1;
    job.setNumReduceTasks(numReducers);
    job.setCombinerClass(SumVectorsReducer.class);
    job.setReducerClass(SumVectorsReducer.class);

    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
    return job;
}
From source file:io.druid.indexer.path.DatasourcePathSpec.java
License:Apache License
@Override
public Job addInputPaths(HadoopDruidIndexerConfig config, Job job) throws IOException {
    Preconditions.checkArgument(segments != null && !segments.isEmpty(), "no segments provided");

    logger.info("Found total [%d] segments for [%s] in interval [%s]", segments.size(),
            ingestionSpec.getDataSource(), ingestionSpec.getInterval());

    DatasourceIngestionSpec updatedIngestionSpec = ingestionSpec;
    if (updatedIngestionSpec.getDimensions() == null) {
        List<String> dims;
        if (config.getParser().getParseSpec().getDimensionsSpec().hasCustomDimensions()) {
            dims = config.getParser().getParseSpec().getDimensionsSpec().getDimensions();
        } else {
            Set<String> dimSet = Sets.newHashSet(Iterables.concat(
                    Iterables.transform(segments, new Function<WindowedDataSegment, Iterable<String>>() {
                        @Override
                        public Iterable<String> apply(WindowedDataSegment dataSegment) {
                            return dataSegment.getSegment().getDimensions();
                        }
                    })));
            dims = Lists.newArrayList(Sets.difference(dimSet,
                    config.getParser().getParseSpec().getDimensionsSpec().getDimensionExclusions()));
        }
        updatedIngestionSpec = updatedIngestionSpec.withDimensions(dims);
    }

    if (updatedIngestionSpec.getMetrics() == null) {
        Set<String> metrics = Sets.newHashSet();
        final AggregatorFactory[] cols = config.getSchema().getDataSchema().getAggregators();
        if (cols != null) {
            for (AggregatorFactory col : cols) {
                metrics.add(col.getName());
            }
        }
        updatedIngestionSpec = updatedIngestionSpec.withMetrics(Lists.newArrayList(metrics));
    }

    updatedIngestionSpec = updatedIngestionSpec
            .withQueryGranularity(config.getGranularitySpec().getQueryGranularity());

    job.getConfiguration().set(DatasourceInputFormat.CONF_DRUID_SCHEMA,
            mapper.writeValueAsString(updatedIngestionSpec));
    job.getConfiguration().set(DatasourceInputFormat.CONF_INPUT_SEGMENTS,
            mapper.writeValueAsString(segments));
    job.getConfiguration().set(DatasourceInputFormat.CONF_MAX_SPLIT_SIZE, String.valueOf(maxSplitSize));
    MultipleInputs.addInputPath(job, new Path("/dummy/tobe/ignored"), DatasourceInputFormat.class);

    return job;
}
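Worth noting: the path "/dummy/tobe/ignored" is never read as data. Judging by the configuration keys set just above, DatasourceInputFormat derives its splits from the serialized segment list in the job configuration, so this addInputPath call effectively serves only to register the input format with the job.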
From source file:io.druid.indexer.path.StaticPathSpec.java
License:Apache License
public static final void addToMultipleInputs(HadoopDruidIndexerConfig config, Job job, String path,
        Class<? extends InputFormat> inputFormatClass) {
    if (path == null) {
        return;
    }

    Class<? extends InputFormat> inputFormatClassToUse = inputFormatClass;
    if (inputFormatClassToUse == null) {
        if (config.isCombineText()) {
            inputFormatClassToUse = CombineTextInputFormat.class;
        } else {
            inputFormatClassToUse = TextInputFormat.class;
        }
    }

    // Due to https://issues.apache.org/jira/browse/MAPREDUCE-5061 we can't directly do
    // MultipleInputs.addInputPath(job, path, inputFormatClassToUse)
    // but have to handle hadoop glob path ourselves correctly.
    // This change and HadoopGlobPathSplitter.java can be removed once the hadoop issue is fixed.
    for (StringBuilder sb : HadoopGlobPathSplitter.splitGlob(path)) {
        MultipleInputs.addInputPath(job, new Path(sb.toString()), inputFormatClassToUse);
    }
}
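To make the MAPREDUCE-5061 workaround concrete, a hedged sketch (paths illustrative): my reading of that issue is that MultipleInputs stores its path-to-format mapping as a single comma-separated configuration value, so a brace glob containing a comma gets split apart when the mapping is read back. Expanding the glob into comma-free paths before registering them, as the loop above does, avoids the problem:

// Problematic: the comma inside the brace glob corrupts MultipleInputs' config encoding.
// MultipleInputs.addInputPath(job, new Path("/logs/{2015,2016}/*.gz"), TextInputFormat.class);

// Safe: register each expanded, comma-free path separately.
MultipleInputs.addInputPath(job, new Path("/logs/2015/*.gz"), TextInputFormat.class);
MultipleInputs.addInputPath(job, new Path("/logs/2016/*.gz"), TextInputFormat.class);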
From source file:org.ankus.mapreduce.algorithms.recommendation.recommender.driver.UserbasedRecommendationDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length < 1) {
        Usage.printUsage(Constants.ALGORITHM_USER_BASED_RECOMMENDATION);
        return -1;
    }
    initArguments(args);

    // Get key (midterm.process.output.remove.mode) from config.properties
    Properties configProperties = AnkusUtils.getConfigProperties();
    String removeModeMidtermProcess = configProperties.get(Constants.MIDTERM_PROCESS_OUTPUT_REMOVE_MODE)
            .toString();
    boolean removeMode = false;
    if (removeModeMidtermProcess.equals(Constants.REMOVE_ON)) {
        removeMode = true;
    }

    // Prepare the output paths used for intermediate job results
    String prepareDirectory = AnkusUtils.createDirectoryForHDFS(output);
    String prepareOutput = prepareDirectory + "/";
    fileSystem = FileSystem.get(new Configuration());
    URI fileSystemUri = fileSystem.getUri();
    Path itemListOutputPath = new Path(fileSystemUri + "/" + prepareOutput + "itemlist");
    Path candidateItemListOutput = new Path(fileSystemUri + "/" + prepareOutput + "candidate");
    Path switchSimilarityOutput1 = new Path(fileSystemUri + "/" + prepareOutput + "switchSimilarity1");
    Path switchSimilarityOutput2 = new Path(fileSystemUri + "/" + prepareOutput + "switchSimilarity2");
    Path aggregateSwitchSimOutput = new Path(fileSystemUri + "/" + prepareOutput + "aggregate");
    Path neighborAllDataOutput = new Path(fileSystemUri + "/" + prepareOutput + "neighborhood");

    /*
     * Step 1. Arrange only the item list of the test data set (base input data set).
     */
    logger.info("==========================================================================================");
    logger.info(" Step 1 of the 7 steps : Arrange only the item list for the input data set. ");
    logger.info("   Input directory [" + input + "]");
    logger.info("   Output directory [" + itemListOutputPath.toString() + "]");
    logger.info("==========================================================================================");
    Job job1 = new Job();
    job1.setJarByClass(UserbasedRecommendationDriver.class);
    job1.setMapperClass(ItemListMapper.class);
    job1.setReducerClass(ItemListReducer.class);
    job1.setMapOutputKeyClass(Text.class);
    job1.setMapOutputValueClass(NullWritable.class);
    job1.setOutputKeyClass(Text.class);
    job1.setOutputValueClass(NullWritable.class);
    FileInputFormat.setInputPaths(job1, new Path(input));
    FileOutputFormat.setOutputPath(job1, itemListOutputPath);
    job1.getConfiguration().set(Constants.DELIMITER, delimiter);
    boolean step1 = job1.waitForCompletion(true);
    if (!step1)
        return -1;

    /*
     * Step 1-1. Arrange similar users from the similarity data set and the movielens data set.
     */
    logger.info("==========================================================================================");
    logger.info(" Step 2 of the 7 steps : Arrange similar users from the similarity data set and the movielens data set. ");
    logger.info("   Input directory [" + similarDataInput + "]");
    logger.info("   Output directory [" + switchSimilarityOutput1.toString() + "]");
    logger.info("==========================================================================================");
    Job job2 = new Job();
    job2.setJarByClass(UserbasedRecommendationDriver.class);
    job2.setMapperClass(Neighborhood1Mapper.class);
    job2.setReducerClass(Neighborhood1Reducer.class);
    job2.setMapOutputKeyClass(Text.class);
    job2.setMapOutputValueClass(Text.class);
    job2.setOutputKeyClass(Text.class);
    job2.setOutputValueClass(Text.class);
    FileInputFormat.setInputPaths(job2, similarDataInput);
    FileOutputFormat.setOutputPath(job2, switchSimilarityOutput1);
    job2.getConfiguration().set(Constants.DELIMITER, delimiter);
    boolean step2 = job2.waitForCompletion(true);
    if (!step2)
        return -1;

    /*
     * Step 1-2. Arrange similar users in the opposite direction from the similarity data set
     * and the movielens data set.
     */
    logger.info("==========================================================================================");
    logger.info(" Step 3 of the 7 steps : Arrange similar users (opposite direction) from the similarity data set and the movielens data set. ");
    logger.info("   Input directory [" + similarDataInput + "]");
    logger.info("   Output directory [" + switchSimilarityOutput2.toString() + "]");
    logger.info("==========================================================================================");
    Job job3 = new Job();
    job3.setJarByClass(UserbasedRecommendationDriver.class);
    job3.setMapperClass(Neighborhood2Mapper.class);
    job3.setReducerClass(Neighborhood2Reducer.class);
    job3.setMapOutputKeyClass(Text.class);
    job3.setMapOutputValueClass(Text.class);
    job3.setOutputKeyClass(Text.class);
    job3.setOutputValueClass(Text.class);
    FileInputFormat.setInputPaths(job3, similarDataInput);
    FileOutputFormat.setOutputPath(job3, switchSimilarityOutput2);
    job3.getConfiguration().set(Constants.DELIMITER, delimiter);
    boolean step3 = job3.waitForCompletion(true);
    if (!step3)
        return -1;

    /*
     * Step 1-3. Aggregate the two similarity result data sets.
     */
    logger.info("==========================================================================================");
    logger.info(" Step 4 of the 7 steps : Aggregate the step 2 and step 3 result data sets. ");
    logger.info("   Multi Input directory 1 [" + switchSimilarityOutput1 + "]");
    logger.info("   Multi Input directory 2 [" + switchSimilarityOutput2 + "]");
    logger.info("   Output directory [" + aggregateSwitchSimOutput.toString() + "]");
    logger.info("==========================================================================================");
    Job job4 = new Job();
    job4.setJarByClass(UserbasedRecommendationDriver.class);
    job4.setMapperClass(AggregateMapper.class);
    job4.setMapOutputKeyClass(NullWritable.class);
    job4.setMapOutputValueClass(Text.class);
    MultipleInputs.addInputPath(job4, switchSimilarityOutput1, TextInputFormat.class);
    MultipleInputs.addInputPath(job4, switchSimilarityOutput2, TextInputFormat.class);
    FileOutputFormat.setOutputPath(job4, aggregateSwitchSimOutput);
    boolean step4 = job4.waitForCompletion(true);
    if (!step4)
        return -1;

    /*
     * Step 2. Join the movielens data set and the similarity (neighborhood) user list.
     */
    logger.info("==========================================================================================");
    logger.info(" Step 5 of the 7 steps : Join the movielens data set and the similarity (step 4) user list. ");
    logger.info("   Multi Input directory 1 [" + input + "]");
    logger.info("   Multi Input directory 2 [" + aggregateSwitchSimOutput.toString() + "]");
    logger.info("   Output directory [" + neighborAllDataOutput.toString() + "]");
    logger.info("==========================================================================================");
    Job job5 = new Job();
    job5.setJarByClass(UserbasedRecommendationDriver.class);
    job5.setReducerClass(NeighborhoodReducer.class);
    job5.setMapOutputKeyClass(Text.class);
    job5.setMapOutputValueClass(Text.class);
    job5.setOutputKeyClass(Text.class);
    job5.setOutputValueClass(Text.class);
    MultipleInputs.addInputPath(job5, new Path(input), TextInputFormat.class, MovielensMapper.class);
    MultipleInputs.addInputPath(job5, aggregateSwitchSimOutput, TextInputFormat.class, NeighborhoodMapper.class);
    FileOutputFormat.setOutputPath(job5, neighborAllDataOutput);
    job5.getConfiguration().set(Constants.DELIMITER, delimiter);
    boolean step5 = job5.waitForCompletion(true);
    if (!step5)
        return -1;

    /*
     * Step 3. Arrange prediction items for n users.
     */
    logger.info("==========================================================================================");
    logger.info(" Step 6 of the 7 steps : Arrange prediction items for n users. ");
    logger.info("   Input directory [" + input + "]");
    logger.info("   Input directory to setup method [" + itemListOutputPath.toString() + "]");
    logger.info("   Output directory [" + candidateItemListOutput.toString() + "]");
    logger.info("==========================================================================================");
    Job job6 = new Job();
    job6.setJarByClass(UserbasedRecommendationDriver.class);
    job6.setMapperClass(PredictionMapper.class);
    job6.setReducerClass(PredictionReducer.class);
    job6.setMapOutputKeyClass(Text.class);
    job6.setMapOutputValueClass(TextTwoWritableComparable.class);
    job6.setOutputKeyClass(Text.class);
    job6.setOutputValueClass(Text.class);
    FileInputFormat.setInputPaths(job6, new Path(input));
    FileOutputFormat.setOutputPath(job6, candidateItemListOutput);
    job6.getConfiguration().set(Constants.DELIMITER, delimiter);
    job6.getConfiguration().set("itemListPath", itemListOutputPath.toString());
    boolean step6 = job6.waitForCompletion(true);
    if (!step6)
        return -1;

    /*
     * Step 4. Finally, calculate the predicted ratings for n users.
     */
    logger.info("==========================================================================================");
    logger.info(" Step 7 of the 7 steps : Finally calculate the predicted ratings for n users. ");
    logger.info("   Multi Input directory 1 [" + candidateItemListOutput.toString() + "]");
    logger.info("   Multi Input directory 2 [" + neighborAllDataOutput.toString() + "]");
    logger.info("   Output directory [" + output + "]");
    logger.info("==========================================================================================");
    Job job7 = new Job();
    job7.setJarByClass(UserbasedRecommendationDriver.class);
    job7.setReducerClass(RecommendationReducer.class);
    job7.setMapOutputKeyClass(Text.class);
    job7.setMapOutputValueClass(Text.class);
    job7.setOutputKeyClass(Text.class);
    job7.setOutputValueClass(DoubleWritable.class);
    MultipleInputs.addInputPath(job7, candidateItemListOutput, TextInputFormat.class, PredictionItemsMapper.class);
    MultipleInputs.addInputPath(job7, neighborAllDataOutput, TextInputFormat.class, UserSimilarityMapper.class);
    FileOutputFormat.setOutputPath(job7, new Path(output));
    job7.getConfiguration().set(Constants.DELIMITER, delimiter);
    boolean step7 = job7.waitForCompletion(true);
    if (!step7)
        return -1;

    // Remove all midterm process output files.
    if (removeMode) {
        boolean delete = fileSystem.delete(new Path(fileSystemUri + "/" + prepareOutput), true);
        if (delete) {
            logger.info("Deleted midterm process output files.");
        }
    }

    return 0;
}
From source file:org.apache.druid.indexer.path.DatasourcePathSpec.java
License:Apache License
@Override
public Job addInputPaths(HadoopDruidIndexerConfig config, Job job) throws IOException {
    if (segments == null || segments.isEmpty()) {
        if (ingestionSpec.isIgnoreWhenNoSegments()) {
            logger.warn("No segments found for ingestionSpec [%s]", ingestionSpec);
            return job;
        } else {
            throw new ISE("No segments found for ingestion spec [%s]", ingestionSpec);
        }
    }

    logger.info("Found total [%d] segments for [%s] in interval [%s]", segments.size(),
            ingestionSpec.getDataSource(), ingestionSpec.getIntervals());

    DatasourceIngestionSpec updatedIngestionSpec = ingestionSpec;
    if (updatedIngestionSpec.getDimensions() == null) {
        List<String> dims;
        if (config.getParser().getParseSpec().getDimensionsSpec().hasCustomDimensions()) {
            dims = config.getParser().getParseSpec().getDimensionsSpec().getDimensionNames();
        } else {
            Set<String> dimSet = Sets.newHashSet(Iterables.concat(
                    Iterables.transform(segments, new Function<WindowedDataSegment, Iterable<String>>() {
                        @Override
                        public Iterable<String> apply(WindowedDataSegment dataSegment) {
                            return dataSegment.getSegment().getDimensions();
                        }
                    })));
            dims = Lists.newArrayList(Sets.difference(dimSet,
                    config.getParser().getParseSpec().getDimensionsSpec().getDimensionExclusions()));
        }
        updatedIngestionSpec = updatedIngestionSpec.withDimensions(dims);
    }

    if (updatedIngestionSpec.getMetrics() == null) {
        Set<String> metrics = new HashSet<>();
        final AggregatorFactory[] cols = config.getSchema().getDataSchema().getAggregators();
        if (cols != null) {
            if (useNewAggs) {
                for (AggregatorFactory col : cols) {
                    metrics.addAll(col.requiredFields());
                }
            } else {
                for (AggregatorFactory col : cols) {
                    metrics.add(col.getName());
                }
            }
        }
        updatedIngestionSpec = updatedIngestionSpec.withMetrics(Lists.newArrayList(metrics));
    }

    updatedIngestionSpec = updatedIngestionSpec
            .withQueryGranularity(config.getGranularitySpec().getQueryGranularity());

    // propagate in the transformSpec from the overall job config
    updatedIngestionSpec = updatedIngestionSpec
            .withTransformSpec(config.getSchema().getDataSchema().getTransformSpec());

    DatasourceInputFormat.addDataSource(job.getConfiguration(), updatedIngestionSpec, segments, maxSplitSize);
    MultipleInputs.addInputPath(job, new Path("/dummy/tobe/ignored"), DatasourceInputFormat.class);
    return job;
}