List of usage examples for org.apache.hadoop.mapreduce.lib.input MultipleInputs addInputPath
@SuppressWarnings("unchecked") public static void addInputPath(Job job, Path path, Class<? extends InputFormat> inputFormatClass)
From source file:com.conversantmedia.mapreduce.tool.annotation.handler.MultiInputAnnotationHandler.java
License:Apache License
@Override
public void process(Annotation annotation, Job job, Object target) throws ToolException {
    for (Input input : ((MultiInput) annotation).value()) {
        Path path = getInputAsPath(input.path());
        if (input.mapper() == Mapper.class) {
            MultipleInputs.addInputPath(job, path, input.format());
        } else {
            MultipleInputs.addInputPath(job, path, input.format(), input.mapper());
            // Need to call again here so the call is captured by our aspect which
            // will replace it with the annotated delegating mapper class for resource
            // injection if required.
            job.setMapperClass(DelegatingMapper.class);
        }
    }
}
From source file:com.twitter.algebra.matrix.multiply.AtB_DMJ.java
License:Apache License
/**
 * Perform A x B, where At and B refer to the paths that contain matrices in
 * {@link SequenceFileInputFormat}. One of At and B must also conform with
 * {@link MapDir} format. Refer to {@link AtB_DMJ} for further details.
 *
 * @param conf the initial configuration
 * @param mapDirPath path to the matrix in {@link MapDir} format
 * @param matrixInputPaths the list of paths to matrix input partitions over
 *          which we iterate
 * @param matrixOutputPath path to which AxB will be written
 * @param atCols number of columns of At (rows of A)
 * @param bCols number of columns of B
 * @param colsPerPartition cols per partition of the input matrix (whether At or B)
 * @param aIsMapDir is A chosen to be loaded as MapDir
 * @param useInMemCombiner whether to use the in-memory combiner
 * @return the running job
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public Job run(Configuration conf, Path mapDirPath, Path matrixInputPaths, Path matrixOutputPath,
        int atCols, int bCols, int colsPerPartition, boolean aIsMapDir, boolean useInMemCombiner)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);
    conf.set(MATRIXINMEMORY, mapDirPath.toString());
    conf.setBoolean(AISMAPDIR, aIsMapDir);
    conf.setBoolean(USEINMEMCOMBINER, useInMemCombiner);
    conf.setInt(RESULTROWS, atCols);
    conf.setInt(RESULTCOLS, bCols);
    conf.setInt(PARTITIONCOLS, colsPerPartition);
    FileSystem fs = FileSystem.get(matrixOutputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPaths, "dmj");

    if (useInMemCombiner) {
        Configuration newConf = new Configuration(conf);
        newConf.set("mapreduce.task.io.sort.mb", "1");
        conf = newConf;
    }

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(AtB_DMJ.class);
    job.setJobName(AtB_DMJ.class.getSimpleName());
    matrixOutputPath = fs.makeQualified(matrixOutputPath);
    matrixInputPaths = fs.makeQualified(matrixInputPaths);

    MultipleInputs.addInputPath(job, matrixInputPaths, SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);
    if (!useInMemCombiner)
        job.setCombinerClass(AtBOuterStaticMapsideJoinJob.MyReducer.class);

    int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "dmj");
    job.setNumReduceTasks(numReducers);
    // ensures total order (when used with {@link MatrixOutputFormat})
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, atCols);

    job.setReducerClass(EpsilonReducer.class);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    return job;
}
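A hypothetical invocation of the method above, assuming a no-arg constructor; all paths and matrix dimensions are illustrative only:

Configuration conf = new Configuration();
AtB_DMJ dmj = new AtB_DMJ();
// Multiply At (stored in MapDir format) by the B partitions, no in-memory combiner.
Job job = dmj.run(conf,
        new Path("/matrices/At-mapdir"),  // mapDirPath (illustrative)
        new Path("/matrices/B-parts"),    // matrixInputPaths (illustrative)
        new Path("/matrices/AxB"),        // matrixOutputPath (illustrative)
        1000,                             // atCols
        500,                              // bCols
        100,                              // colsPerPartition
        true,                             // aIsMapDir: A is the MapDir-formatted operand
        false);                           // useInMemCombiner
job.waitForCompletion(true);              // run() only submits; wait for the result here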
From source file:com.twitter.algebra.matrix.text.Matrix2TextJob.java
License:Apache License
public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath)
        throws IOException, InterruptedException, ClassNotFoundException {
    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(Matrix2TextJob.class);
    job.setJobName(Matrix2TextJob.class.getSimpleName());

    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    // FileInputFormat.addInputPath(job, matrixInputPath);
    MultipleInputs.addInputPath(job, matrixInputPath, SequenceFileInputFormat.class);
    // job.setInputFormatClass(SequenceFileInputFormat.class);
    TextOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(TextOutputFormat.class);
    // job.setOutputKeyClass(IntWritable.class);
    // job.setOutputValueClass(org.apache.hadoop.io.Text);
    job.setMapperClass(IdMapper.class);

    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}
From source file:com.twitter.algebra.nmf.ColPartitionJob.java
License:Apache License
/**
 * Partition A on columns, where A refers to the path that contains a matrix in
 * {@link SequenceFileInputFormat}. Refer to {@link ColPartitionJob} for
 * further details.
 *
 * @param conf the initial configuration
 * @param matrixInputPath the path to the input matrix A
 * @param matrixOutputPath the path of the resulting partitioned matrix
 * @param numInputRows rows
 * @param numInputCols cols
 * @param numColPartitions the hint for the desired number of column partitions
 * @return the running job
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public Job run(Configuration conf, Path matrixInputPath, Path matrixOutputPath,
        int numInputRows, int numInputCols, int numColPartitions)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);
    FileSystem fs = FileSystem.get(matrixOutputPath.toUri(), conf);

    int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "colpartition");
    int colPartSize = getColPartitionSize(numInputCols, numColPartitions);
    numColPartitions = (int) Math.ceil(numInputCols / (double) colPartSize);
    if (numReducers < numColPartitions)
        numReducers = numColPartitions;

    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "colpartition");

    conf.setInt(NUM_ORIG_ROWS_KEY, numInputRows);
    conf.setInt(NUM_ORIG_COLS_KEY, numInputCols);
    conf.setInt(NUM_COL_PARTITIONS, numColPartitions);

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(ColPartitionJob.class);
    job.setJobName(ColPartitionJob.class.getSimpleName());
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    MultipleInputs.addInputPath(job, matrixInputPath, SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(ElementWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    RowColPartitioner.setPartitioner(job, RowColPartitioner.ElementRowColPartitioner.class,
            numInputRows, numInputCols, numColPartitions);

    job.setReducerClass(MyReducer.class);
    job.setNumReduceTasks(numReducers);

    // job.setOutputFormatClass(SequenceFileOutputFormat.class);
    LazyOutputFormat.setOutputFormatClass(job, SequenceFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
    return job;
}
From source file:com.twitter.algebra.nmf.CompositeDMJ.java
License:Apache License
public Job run(Configuration conf, Path mapDirPath, Path matrixInputPaths, Path matrixOutputPath,
        int atCols, boolean aIsMapDir, String inMemCStr, int inMemCRows, int inMemCCols,
        float alpha1, float alpha2)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);
    conf.set(MATRIXINMEMORY, inMemCStr);
    conf.setInt(MATRIXINMEMORYROWS, inMemCRows);
    conf.setInt(MATRIXINMEMORYCOLS, inMemCCols);
    conf.setFloat(ALPHA1, alpha1);
    conf.setFloat(ALPHA2, alpha2);
    FileSystem fs = FileSystem.get(matrixOutputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPaths, "compositedmj");
    conf.set(MAPDIRMATRIX, mapDirPath.toString());
    conf.setBoolean(AISMAPDIR, aIsMapDir);

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(CompositeDMJ.class);
    job.setJobName(CompositeDMJ.class.getSimpleName() + "-" + matrixOutputPath.getName());
    matrixOutputPath = fs.makeQualified(matrixOutputPath);
    matrixInputPaths = fs.makeQualified(matrixInputPaths);

    MultipleInputs.addInputPath(job, matrixInputPaths, SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    // ensures total order (when used with {@link MatrixOutputFormat})
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, atCols);

    job.setNumReduceTasks(0);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    return job;
}
From source file:com.twitter.algebra.nmf.ErrDMJ.java
License:Apache License
public Job run(Configuration conf, Path xPath, Path matrixAInputPath, Path ytPath, Path outPath,
        int aRows, int ytRows, int ytCols)
        throws IOException, InterruptedException, ClassNotFoundException {
    conf = new Configuration(conf);
    conf.set(MAPDIRMATRIXX, xPath.toString());
    conf.set(MAPDIRMATRIXYT, ytPath.toString());
    conf.setInt(YTROWS, ytRows);
    conf.setInt(YTCOLS, ytCols);
    FileSystem fs = FileSystem.get(outPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixAInputPath, "err");

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(ErrDMJ.class);
    job.setJobName(ErrDMJ.class.getSimpleName() + "-" + outPath.getName());
    matrixAInputPath = fs.makeQualified(matrixAInputPath);
    MultipleInputs.addInputPath(job, matrixAInputPath, SequenceFileInputFormat.class);
    outPath = fs.makeQualified(outPath);
    FileOutputFormat.setOutputPath(job, outPath);

    job.setMapperClass(MyMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    int numReducers = 1;
    job.setNumReduceTasks(numReducers);
    job.setCombinerClass(SumVectorsReducer.class);
    job.setReducerClass(SumVectorsReducer.class);

    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);

    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
    return job;
}
From source file:io.druid.indexer.path.DatasourcePathSpec.java
License:Apache License
@Override
public Job addInputPaths(HadoopDruidIndexerConfig config, Job job) throws IOException {
    Preconditions.checkArgument(segments != null && !segments.isEmpty(), "no segments provided");

    logger.info("Found total [%d] segments for [%s] in interval [%s]", segments.size(),
            ingestionSpec.getDataSource(), ingestionSpec.getInterval());

    DatasourceIngestionSpec updatedIngestionSpec = ingestionSpec;
    if (updatedIngestionSpec.getDimensions() == null) {
        List<String> dims;
        if (config.getParser().getParseSpec().getDimensionsSpec().hasCustomDimensions()) {
            dims = config.getParser().getParseSpec().getDimensionsSpec().getDimensions();
        } else {
            Set<String> dimSet = Sets.newHashSet(Iterables.concat(
                    Iterables.transform(segments, new Function<WindowedDataSegment, Iterable<String>>() {
                        @Override
                        public Iterable<String> apply(WindowedDataSegment dataSegment) {
                            return dataSegment.getSegment().getDimensions();
                        }
                    })));
            dims = Lists.newArrayList(Sets.difference(dimSet,
                    config.getParser().getParseSpec().getDimensionsSpec().getDimensionExclusions()));
        }
        updatedIngestionSpec = updatedIngestionSpec.withDimensions(dims);
    }

    if (updatedIngestionSpec.getMetrics() == null) {
        Set<String> metrics = Sets.newHashSet();
        final AggregatorFactory[] cols = config.getSchema().getDataSchema().getAggregators();
        if (cols != null) {
            for (AggregatorFactory col : cols) {
                metrics.add(col.getName());
            }
        }
        updatedIngestionSpec = updatedIngestionSpec.withMetrics(Lists.newArrayList(metrics));
    }

    updatedIngestionSpec = updatedIngestionSpec
            .withQueryGranularity(config.getGranularitySpec().getQueryGranularity());

    job.getConfiguration().set(DatasourceInputFormat.CONF_DRUID_SCHEMA,
            mapper.writeValueAsString(updatedIngestionSpec));
    job.getConfiguration().set(DatasourceInputFormat.CONF_INPUT_SEGMENTS,
            mapper.writeValueAsString(segments));
    job.getConfiguration().set(DatasourceInputFormat.CONF_MAX_SPLIT_SIZE, String.valueOf(maxSplitSize));
    MultipleInputs.addInputPath(job, new Path("/dummy/tobe/ignored"), DatasourceInputFormat.class);

    return job;
}
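Worth noting: the path "/dummy/tobe/ignored" is never read as data. Judging by the configuration keys set just above, DatasourceInputFormat derives its splits from the serialized segment list in the job configuration, so this addInputPath call effectively serves only to register the input format with the job.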
From source file:io.druid.indexer.path.StaticPathSpec.java
License:Apache License
public static final void addToMultipleInputs(HadoopDruidIndexerConfig config, Job job, String path,
        Class<? extends InputFormat> inputFormatClass) {
    if (path == null) {
        return;
    }

    Class<? extends InputFormat> inputFormatClassToUse = inputFormatClass;
    if (inputFormatClassToUse == null) {
        if (config.isCombineText()) {
            inputFormatClassToUse = CombineTextInputFormat.class;
        } else {
            inputFormatClassToUse = TextInputFormat.class;
        }
    }

    // Due to https://issues.apache.org/jira/browse/MAPREDUCE-5061 we can't directly do
    // MultipleInputs.addInputPath(job, path, inputFormatClassToUse)
    // but have to handle hadoop glob path ourselves correctly.
    // This change and HadoopGlobPathSplitter.java can be removed once the hadoop issue is fixed.
    for (StringBuilder sb : HadoopGlobPathSplitter.splitGlob(path)) {
        MultipleInputs.addInputPath(job, new Path(sb.toString()), inputFormatClassToUse);
    }
}
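To make the MAPREDUCE-5061 workaround concrete, a hedged sketch (paths illustrative): my reading of that issue is that MultipleInputs stores its path-to-format mapping as a single comma-separated configuration value, so a brace glob containing a comma gets split apart when the mapping is read back. Expanding the glob into comma-free paths before registering them, as the loop above does, avoids the problem:

// Problematic: the comma inside the brace glob corrupts MultipleInputs' config encoding.
// MultipleInputs.addInputPath(job, new Path("/logs/{2015,2016}/*.gz"), TextInputFormat.class);

// Safe: register each expanded, comma-free path separately.
MultipleInputs.addInputPath(job, new Path("/logs/2015/*.gz"), TextInputFormat.class);
MultipleInputs.addInputPath(job, new Path("/logs/2016/*.gz"), TextInputFormat.class);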
From source file:org.ankus.mapreduce.algorithms.recommendation.recommender.driver.UserbasedRecommendationDriver.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length < 1) {
        Usage.printUsage(Constants.ALGORITHM_USER_BASED_RECOMMENDATION);
        return -1;
    }
    initArguments(args);

    // Get key (midterm.process.output.remove.mode) from config.properties
    Properties configProperties = AnkusUtils.getConfigProperties();
    String removeModeMidtermProcess = configProperties.get(Constants.MIDTERM_PROCESS_OUTPUT_REMOVE_MODE)
            .toString();
    boolean removeMode = false;
    if (removeModeMidtermProcess.equals(Constants.REMOVE_ON)) {
        removeMode = true;
    }

    // Prepare the output paths used for intermediate job results
    String prepareDirectory = AnkusUtils.createDirectoryForHDFS(output);
    String prepareOutput = prepareDirectory + "/";
    fileSystem = FileSystem.get(new Configuration());
    URI fileSystemUri = fileSystem.getUri();
    Path itemListOutputPath = new Path(fileSystemUri + "/" + prepareOutput + "itemlist");
    Path candidateItemListOutput = new Path(fileSystemUri + "/" + prepareOutput + "candidate");
    Path switchSimilarityOutput1 = new Path(fileSystemUri + "/" + prepareOutput + "switchSimilarity1");
    Path switchSimilarityOutput2 = new Path(fileSystemUri + "/" + prepareOutput + "switchSimilarity2");
    Path aggregateSwitchSimOutput = new Path(fileSystemUri + "/" + prepareOutput + "aggregate");
    Path neighborAllDataOutput = new Path(fileSystemUri + "/" + prepareOutput + "neighborhood");

    /*
     * Step 1. Arrange only the item list of the test data set (base input data set).
     */
    logger.info("==========================================================================================");
    logger.info(" Step 1 of the 7 steps : Arrange only the item list for the input data set. ");
    logger.info("   Input directory [" + input + "]");
    logger.info("   Output directory [" + itemListOutputPath.toString() + "]");
    logger.info("==========================================================================================");
    Job job1 = new Job();
    job1.setJarByClass(UserbasedRecommendationDriver.class);
    job1.setMapperClass(ItemListMapper.class);
    job1.setReducerClass(ItemListReducer.class);
    job1.setMapOutputKeyClass(Text.class);
    job1.setMapOutputValueClass(NullWritable.class);
    job1.setOutputKeyClass(Text.class);
    job1.setOutputValueClass(NullWritable.class);
    FileInputFormat.setInputPaths(job1, new Path(input));
    FileOutputFormat.setOutputPath(job1, itemListOutputPath);
    job1.getConfiguration().set(Constants.DELIMITER, delimiter);
    boolean step1 = job1.waitForCompletion(true);
    if (!step1)
        return -1;

    /*
     * Step 1-1. Arrange similar users from the similarity data set and the movielens data set.
     */
    logger.info("==========================================================================================");
    logger.info(" Step 2 of the 7 steps : Arrange similar users from the similarity data set and the movielens data set. ");
    logger.info("   Input directory [" + similarDataInput + "]");
    logger.info("   Output directory [" + switchSimilarityOutput1.toString() + "]");
    logger.info("==========================================================================================");
    Job job2 = new Job();
    job2.setJarByClass(UserbasedRecommendationDriver.class);
    job2.setMapperClass(Neighborhood1Mapper.class);
    job2.setReducerClass(Neighborhood1Reducer.class);
    job2.setMapOutputKeyClass(Text.class);
    job2.setMapOutputValueClass(Text.class);
    job2.setOutputKeyClass(Text.class);
    job2.setOutputValueClass(Text.class);
    FileInputFormat.setInputPaths(job2, similarDataInput);
    FileOutputFormat.setOutputPath(job2, switchSimilarityOutput1);
    job2.getConfiguration().set(Constants.DELIMITER, delimiter);
    boolean step2 = job2.waitForCompletion(true);
    if (!step2)
        return -1;

    /*
     * Step 1-2. Arrange similar users in the opposite direction from the similarity data set
     * and the movielens data set.
     */
    logger.info("==========================================================================================");
    logger.info(" Step 3 of the 7 steps : Arrange similar users (opposite direction) from the similarity data set and the movielens data set. ");
    logger.info("   Input directory [" + similarDataInput + "]");
    logger.info("   Output directory [" + switchSimilarityOutput2.toString() + "]");
    logger.info("==========================================================================================");
    Job job3 = new Job();
    job3.setJarByClass(UserbasedRecommendationDriver.class);
    job3.setMapperClass(Neighborhood2Mapper.class);
    job3.setReducerClass(Neighborhood2Reducer.class);
    job3.setMapOutputKeyClass(Text.class);
    job3.setMapOutputValueClass(Text.class);
    job3.setOutputKeyClass(Text.class);
    job3.setOutputValueClass(Text.class);
    FileInputFormat.setInputPaths(job3, similarDataInput);
    FileOutputFormat.setOutputPath(job3, switchSimilarityOutput2);
    job3.getConfiguration().set(Constants.DELIMITER, delimiter);
    boolean step3 = job3.waitForCompletion(true);
    if (!step3)
        return -1;

    /*
     * Step 1-3. Aggregate the two similarity result data sets.
     */
    logger.info("==========================================================================================");
    logger.info(" Step 4 of the 7 steps : Aggregate the step 2 and step 3 result data sets. ");
    logger.info("   Multi Input directory 1 [" + switchSimilarityOutput1 + "]");
    logger.info("   Multi Input directory 2 [" + switchSimilarityOutput2 + "]");
    logger.info("   Output directory [" + aggregateSwitchSimOutput.toString() + "]");
    logger.info("==========================================================================================");
    Job job4 = new Job();
    job4.setJarByClass(UserbasedRecommendationDriver.class);
    job4.setMapperClass(AggregateMapper.class);
    job4.setMapOutputKeyClass(NullWritable.class);
    job4.setMapOutputValueClass(Text.class);
    MultipleInputs.addInputPath(job4, switchSimilarityOutput1, TextInputFormat.class);
    MultipleInputs.addInputPath(job4, switchSimilarityOutput2, TextInputFormat.class);
    FileOutputFormat.setOutputPath(job4, aggregateSwitchSimOutput);
    boolean step4 = job4.waitForCompletion(true);
    if (!step4)
        return -1;

    /*
     * Step 2. Join the movielens data set and the similarity (neighborhood) user list.
     */
    logger.info("==========================================================================================");
    logger.info(" Step 5 of the 7 steps : Join the movielens data set and the similarity (step 4) user list. ");
    logger.info("   Multi Input directory 1 [" + input + "]");
    logger.info("   Multi Input directory 2 [" + aggregateSwitchSimOutput.toString() + "]");
    logger.info("   Output directory [" + neighborAllDataOutput.toString() + "]");
    logger.info("==========================================================================================");
    Job job5 = new Job();
    job5.setJarByClass(UserbasedRecommendationDriver.class);
    job5.setReducerClass(NeighborhoodReducer.class);
    job5.setMapOutputKeyClass(Text.class);
    job5.setMapOutputValueClass(Text.class);
    job5.setOutputKeyClass(Text.class);
    job5.setOutputValueClass(Text.class);
    MultipleInputs.addInputPath(job5, new Path(input), TextInputFormat.class, MovielensMapper.class);
    MultipleInputs.addInputPath(job5, aggregateSwitchSimOutput, TextInputFormat.class, NeighborhoodMapper.class);
    FileOutputFormat.setOutputPath(job5, neighborAllDataOutput);
    job5.getConfiguration().set(Constants.DELIMITER, delimiter);
    boolean step5 = job5.waitForCompletion(true);
    if (!step5)
        return -1;

    /*
     * Step 3. Arrange prediction items for n users.
     */
    logger.info("==========================================================================================");
    logger.info(" Step 6 of the 7 steps : Arrange prediction items for n users. ");
    logger.info("   Input directory [" + input + "]");
    logger.info("   Input directory to setup method [" + itemListOutputPath.toString() + "]");
    logger.info("   Output directory [" + candidateItemListOutput.toString() + "]");
    logger.info("==========================================================================================");
    Job job6 = new Job();
    job6.setJarByClass(UserbasedRecommendationDriver.class);
    job6.setMapperClass(PredictionMapper.class);
    job6.setReducerClass(PredictionReducer.class);
    job6.setMapOutputKeyClass(Text.class);
    job6.setMapOutputValueClass(TextTwoWritableComparable.class);
    job6.setOutputKeyClass(Text.class);
    job6.setOutputValueClass(Text.class);
    FileInputFormat.setInputPaths(job6, new Path(input));
    FileOutputFormat.setOutputPath(job6, candidateItemListOutput);
    job6.getConfiguration().set(Constants.DELIMITER, delimiter);
    job6.getConfiguration().set("itemListPath", itemListOutputPath.toString());
    boolean step6 = job6.waitForCompletion(true);
    if (!step6)
        return -1;

    /*
     * Step 4. Finally, calculate the predicted ratings for n users.
     */
    logger.info("==========================================================================================");
    logger.info(" Step 7 of the 7 steps : Finally calculate the predicted ratings for n users. ");
    logger.info("   Multi Input directory 1 [" + candidateItemListOutput.toString() + "]");
    logger.info("   Multi Input directory 2 [" + neighborAllDataOutput.toString() + "]");
    logger.info("   Output directory [" + output + "]");
    logger.info("==========================================================================================");
    Job job7 = new Job();
    job7.setJarByClass(UserbasedRecommendationDriver.class);
    job7.setReducerClass(RecommendationReducer.class);
    job7.setMapOutputKeyClass(Text.class);
    job7.setMapOutputValueClass(Text.class);
    job7.setOutputKeyClass(Text.class);
    job7.setOutputValueClass(DoubleWritable.class);
    MultipleInputs.addInputPath(job7, candidateItemListOutput, TextInputFormat.class, PredictionItemsMapper.class);
    MultipleInputs.addInputPath(job7, neighborAllDataOutput, TextInputFormat.class, UserSimilarityMapper.class);
    FileOutputFormat.setOutputPath(job7, new Path(output));
    job7.getConfiguration().set(Constants.DELIMITER, delimiter);
    boolean step7 = job7.waitForCompletion(true);
    if (!step7)
        return -1;

    // Remove all midterm process output files.
    if (removeMode) {
        boolean delete = fileSystem.delete(new Path(fileSystemUri + "/" + prepareOutput), true);
        if (delete) {
            logger.info("Deleted midterm process output files.");
        }
    }

    return 0;
}
From source file:org.apache.druid.indexer.path.DatasourcePathSpec.java
License:Apache License
@Override
public Job addInputPaths(HadoopDruidIndexerConfig config, Job job) throws IOException {
    if (segments == null || segments.isEmpty()) {
        if (ingestionSpec.isIgnoreWhenNoSegments()) {
            logger.warn("No segments found for ingestionSpec [%s]", ingestionSpec);
            return job;
        } else {
            throw new ISE("No segments found for ingestion spec [%s]", ingestionSpec);
        }
    }

    logger.info("Found total [%d] segments for [%s] in interval [%s]", segments.size(),
            ingestionSpec.getDataSource(), ingestionSpec.getIntervals());

    DatasourceIngestionSpec updatedIngestionSpec = ingestionSpec;
    if (updatedIngestionSpec.getDimensions() == null) {
        List<String> dims;
        if (config.getParser().getParseSpec().getDimensionsSpec().hasCustomDimensions()) {
            dims = config.getParser().getParseSpec().getDimensionsSpec().getDimensionNames();
        } else {
            Set<String> dimSet = Sets.newHashSet(Iterables.concat(
                    Iterables.transform(segments, new Function<WindowedDataSegment, Iterable<String>>() {
                        @Override
                        public Iterable<String> apply(WindowedDataSegment dataSegment) {
                            return dataSegment.getSegment().getDimensions();
                        }
                    })));
            dims = Lists.newArrayList(Sets.difference(dimSet,
                    config.getParser().getParseSpec().getDimensionsSpec().getDimensionExclusions()));
        }
        updatedIngestionSpec = updatedIngestionSpec.withDimensions(dims);
    }

    if (updatedIngestionSpec.getMetrics() == null) {
        Set<String> metrics = new HashSet<>();
        final AggregatorFactory[] cols = config.getSchema().getDataSchema().getAggregators();
        if (cols != null) {
            if (useNewAggs) {
                for (AggregatorFactory col : cols) {
                    metrics.addAll(col.requiredFields());
                }
            } else {
                for (AggregatorFactory col : cols) {
                    metrics.add(col.getName());
                }
            }
        }
        updatedIngestionSpec = updatedIngestionSpec.withMetrics(Lists.newArrayList(metrics));
    }

    updatedIngestionSpec = updatedIngestionSpec
            .withQueryGranularity(config.getGranularitySpec().getQueryGranularity());

    // propagate in the transformSpec from the overall job config
    updatedIngestionSpec = updatedIngestionSpec
            .withTransformSpec(config.getSchema().getDataSchema().getTransformSpec());

    DatasourceInputFormat.addDataSource(job.getConfiguration(), updatedIngestionSpec, segments, maxSplitSize);
    MultipleInputs.addInputPath(job, new Path("/dummy/tobe/ignored"), DatasourceInputFormat.class);
    return job;
}