List of usage examples for org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getInputPaths
public static Path[] getInputPaths(JobContext context)
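Before the per-project examples, here is a minimal driver sketch of how getInputPaths is typically paired with FileInputFormat.setInputPaths. The class name, job name, and input paths below are illustrative assumptions, not taken from any of the source files listed.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class GetInputPathsSketch {
    public static void main(String[] args) throws Exception {
        // Illustrative job name and paths; replace with real input directories.
        Job job = Job.getInstance(new Configuration(), "get-input-paths-sketch");
        FileInputFormat.setInputPaths(job, new Path("/data/in1"), new Path("/data/in2"));

        // Job implements JobContext, so it can be passed straight to getInputPaths.
        for (Path input : FileInputFormat.getInputPaths(job)) {
            System.out.println("configured input path: " + input);
        }
    }
}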
From source file:alluxio.hadoop.mapreduce.KeyValueInputFormat.java
License:Apache License
/**
 * Returns a list of {@link KeyValueInputSplit} where each split is one key-value partition.
 *
 * @param jobContext MapReduce job configuration
 * @return list of {@link InputSplit}s, each split is a partition
 * @throws IOException if information about the partition cannot be retrieved
 */
@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException {
    // The paths are the MapReduce program's inputs specified in
    // {@code mapreduce.input.fileinputformat.inputdir}; each path should be a key-value store.
    Path[] paths = FileInputFormat.getInputPaths(jobContext);
    List<InputSplit> splits = new ArrayList<>();
    try {
        for (Path path : paths) {
            List<PartitionInfo> partitionInfos = mKeyValueMasterClient
                    .getPartitionInfo(new AlluxioURI(path.toString()));
            for (PartitionInfo partitionInfo : partitionInfos) {
                splits.add(new KeyValueInputSplit(partitionInfo));
            }
        }
    } catch (AlluxioException e) {
        throw new IOException(e);
    }
    return splits;
}
From source file:bdss.cmu.edu.Sort.java
License:Apache License
/**
 * The main driver for sort program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there is communication problems with the
 *                     job tracker.
 */
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = conf.get(REDUCES_PER_HOST);
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits) {
                    maxSplits = Integer.MAX_VALUE;
                }
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    job = new Job(conf);
    job.setJobName("sorter");
    job.setJarByClass(Sort.class);
    job.setMapperClass(Mapper.class);
    job.setReducerClass(Reducer.class);
    job.setNumReduceTasks(num_reduces);
    job.setInputFormatClass(inputFormatClass);
    job.setOutputFormatClass(outputFormatClass);
    job.setOutputKeyClass(outputKeyClass);
    job.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(job, otherArgs.get(0));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        job.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(job)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(conf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        InputSampler.<K, V>writePartitionFile(job, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, conf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(job)[0] + " into " + FileOutputFormat.getOutputPath(job)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return ret;
}
From source file:co.cask.cdap.hive.datasets.DatasetInputFormat.java
License:Apache License
@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
    try (DatasetAccessor datasetAccessor = new DatasetAccessor(jobConf)) {
        try {
            datasetAccessor.initialize();
        } catch (Exception e) {
            throw new IOException("Could not get dataset", e);
        }

        try (RecordScannable recordScannable = datasetAccessor.getDataset()) {
            Job job = new Job(jobConf);
            JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
            Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);

            List<Split> dsSplits = recordScannable.getSplits();
            InputSplit[] inputSplits = new InputSplit[dsSplits.size()];
            for (int i = 0; i < dsSplits.size(); i++) {
                inputSplits[i] = new DatasetInputSplit(dsSplits.get(i), tablePaths[0]);
            }
            return inputSplits;
        }
    }
}
From source file:co.cask.cdap.hive.stream.HiveStreamInputFormat.java
License:Apache License
private StreamInputSplitFinder<InputSplit> getSplitFinder(JobConf conf) throws IOException {
    // first get the context we are in
    ContextManager.Context context = ContextManager.getContext(conf);

    String streamName = conf.get(Constants.Explore.STREAM_NAME);
    String streamNamespace = conf.get(Constants.Explore.STREAM_NAMESPACE);
    Id.Stream streamId = Id.Stream.from(streamNamespace, streamName);
    StreamConfig streamConfig = context.getStreamConfig(streamId);
    // make sure we get the current generation so we don't read events that occurred before a truncate.
    Location streamPath = StreamUtils.createGenerationLocation(streamConfig.getLocation(),
            StreamUtils.getGeneration(streamConfig));

    StreamInputSplitFinder.Builder builder = StreamInputSplitFinder.builder(streamPath.toURI());

    // Get the Hive table path for the InputSplit created. It is just to satisfy hive. The InputFormat never uses it.
    JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(Job.getInstance(conf));
    final Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);

    return setupBuilder(conf, streamConfig, builder).build(new StreamInputSplitFactory<InputSplit>() {
        @Override
        public InputSplit createSplit(Path eventPath, Path indexPath, long startTime, long endTime,
                                      long start, long length, @Nullable String[] locations) {
            return new StreamInputSplit(tablePaths[0], eventPath, indexPath, startTime, endTime, start,
                    length, locations);
        }
    });
}
From source file:com.aliyun.openservices.tablestore.hive.TableStoreInputFormat.java
License:Apache License
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    Configuration dest = translateConfig(job);
    SyncClientInterface ots = null;
    String columns = job.get(TableStoreConsts.COLUMNS_MAPPING);
    if (columns == null) {
        columns = job.get(serdeConstants.LIST_COLUMNS);
    }
    logger.debug("columns to get: {}", columns);

    List<org.apache.hadoop.mapreduce.InputSplit> splits;
    try {
        ots = TableStore.newOtsClient(dest);
        TableMeta meta = fetchTableMeta(ots, job.get(TableStoreConsts.TABLE_NAME));
        RangeRowQueryCriteria criteria = fetchCriteria(meta, columns);
        com.aliyun.openservices.tablestore.hadoop.TableStoreInputFormat.addCriteria(dest, criteria);
        splits = com.aliyun.openservices.tablestore.hadoop.TableStoreInputFormat.getSplits(dest, ots);
    } finally {
        if (ots != null) {
            ots.shutdown();
            ots = null;
        }
    }

    InputSplit[] res = new InputSplit[splits.size()];
    JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(new Job(job));
    Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);
    int i = 0;
    for (org.apache.hadoop.mapreduce.InputSplit split : splits) {
        res[i] = new TableStoreInputSplit(
                (com.aliyun.openservices.tablestore.hadoop.TableStoreInputSplit) split, tablePaths[0]);
        ++i;
    }
    return res;
}
From source file:com.asakusafw.runtime.stage.input.TemporaryInputFormat.java
License:Apache License
/**
 * Returns input paths.
 * @param context current job
 * @return the input paths, or an empty list if they are not set
 * @throws IOException if failed to resolve paths
 * @throws IllegalArgumentException if some parameters were {@code null}
 * @since 0.7.0
 */
public static List<Path> getInputPaths(JobContext context) throws IOException {
    if (context == null) {
        throw new IllegalArgumentException("job must not be null"); //$NON-NLS-1$
    }
    Path[] paths = FileInputFormat.getInputPaths(context);
    if (paths == null || paths.length == 0) {
        return Collections.emptyList();
    }
    return Arrays.asList(paths);
}
From source file:com.ask.hive.hbase.HiveHBaseTextTableInputFormat.java
License:Apache License
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
    String hbaseTableName = jobConf.get(HBaseSerDe.HBASE_TABLE_NAME);
    setHTable(new HTable(new HBaseConfiguration(jobConf), Bytes.toBytes(hbaseTableName)));
    String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);
    if (hbaseColumnsMapping == null) {
        throw new IOException("hbase.columns.mapping required for HBase Table.");
    }

    List<String> hbaseColumnFamilies = new ArrayList<String>();
    List<String> hbaseColumnQualifiers = new ArrayList<String>();
    List<byte[]> hbaseColumnFamiliesBytes = new ArrayList<byte[]>();
    List<byte[]> hbaseColumnQualifiersBytes = new ArrayList<byte[]>();

    int iKey;
    try {
        iKey = HBaseSerDe.parseColumnMapping(hbaseColumnsMapping, hbaseColumnFamilies,
                hbaseColumnFamiliesBytes, hbaseColumnQualifiers, hbaseColumnQualifiersBytes);
    } catch (SerDeException se) {
        throw new IOException(se);
    }

    Scan scan = new Scan();

    // Take filter pushdown into account while calculating splits; this
    // allows us to prune off regions immediately. Note that although
    // the Javadoc for the superclass getSplits says that it returns one
    // split per region, the implementation actually takes the scan
    // definition into account and excludes regions which don't satisfy
    // the start/stop row conditions (HBASE-1829).
    convertFilter(jobConf, scan, null, iKey);

    // REVIEW: are we supposed to be applying the getReadColumnIDs
    // same as in getRecordReader?
    for (int i = 0; i < hbaseColumnFamilies.size(); i++) {
        if (i == iKey) {
            continue;
        }
        if (hbaseColumnQualifiers.get(i) == null) {
            scan.addFamily(hbaseColumnFamiliesBytes.get(i));
        } else {
            scan.addColumn(hbaseColumnFamiliesBytes.get(i), hbaseColumnQualifiersBytes.get(i));
        }
    }
    setScan(scan);

    Job job = new Job(jobConf);
    JobContext jobContext = new JobContext(job.getConfiguration(), job.getJobID());
    Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);

    List<org.apache.hadoop.mapreduce.InputSplit> splits = super.getSplits(jobContext);
    InputSplit[] results = new InputSplit[splits.size()];
    for (int i = 0; i < splits.size(); i++) {
        results[i] = new HBaseSplit((TableSplit) splits.get(i), tablePaths[0]);
    }
    return results;
}
From source file:com.cloudera.castagna.logparser.Utils.java
License:Apache License
public static void log(Job job, Logger log) throws ClassNotFoundException {
    log.debug("{} -> {} ({}, {}) -> {}#{} ({}, {}) -> {}",
            new Object[] { job.getInputFormatClass().getSimpleName(), job.getMapperClass().getSimpleName(),
                    job.getMapOutputKeyClass().getSimpleName(), job.getMapOutputValueClass().getSimpleName(),
                    job.getReducerClass().getSimpleName(), job.getNumReduceTasks(),
                    job.getOutputKeyClass().getSimpleName(), job.getOutputValueClass().getSimpleName(),
                    job.getOutputFormatClass().getSimpleName() });

    Path[] inputs = FileInputFormat.getInputPaths(job);
    Path output = FileOutputFormat.getOutputPath(job);
    log.debug("input: {}", inputs[0]);
    log.debug("output: {}", output);
}
From source file:com.github.dryangkun.hbase.tidx.hive.HiveHBaseTableInputFormat.java
License:Apache License
private InputSplit[] getSplitsInternal(JobConf jobConf, int numSplits) throws IOException {
    // obtain delegation tokens for the job
    if (UserGroupInformation.getCurrentUser().hasKerberosCredentials()) {
        TableMapReduceUtil.initCredentials(jobConf);
    }

    String hbaseTableName = jobConf.get(HBaseSerDe.HBASE_TABLE_NAME);
    String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);
    boolean doColumnRegexMatching = jobConf.getBoolean(HBaseSerDe.HBASE_COLUMNS_REGEX_MATCHING, true);

    if (hbaseColumnsMapping == null) {
        throw new IOException(HBaseSerDe.HBASE_COLUMNS_MAPPING + " required for HBase Table.");
    }

    ColumnMappings columnMappings = null;
    int iTimeColumn = -1;
    try {
        columnMappings = HBaseSerDe.parseColumnsMapping(hbaseColumnsMapping, doColumnRegexMatching);
        iTimeColumn = HBaseSerDe.getTxTimeColumnIndex(columnMappings, jobConf);
    } catch (SerDeException e) {
        throw new IOException(e);
    }

    int iKey = columnMappings.getKeyIndex();
    int iTimestamp = columnMappings.getTimestampIndex();
    ColumnMapping keyMapping = columnMappings.getKeyMapping();

    if (iTimeColumn != -1) {
        List<org.apache.hadoop.mapreduce.InputSplit> splits = TxHiveTableInputFormatUtil.getSplits(jobConf,
                numSplits, columnMappings, iTimeColumn, hbaseTableName);
        if (splits != null) {
            Job job = new Job(jobConf);
            JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
            Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);

            InputSplit[] results = new InputSplit[splits.size()];
            for (int i = 0; i < splits.size(); i++) {
                results[i] = new HBaseSplit((TableSplit) splits.get(i), tablePaths[0], true);
            }
            LOG.info("getSplits: TxHiveIndexScan");
            return results;
        }
    }
    LOG.info("getSplits: no TxHiveIndexScan");

    setHTable(new HTable(HBaseConfiguration.create(jobConf), Bytes.toBytes(hbaseTableName)));

    // Take filter pushdown into account while calculating splits; this
    // allows us to prune off regions immediately. Note that although
    // the Javadoc for the superclass getSplits says that it returns one
    // split per region, the implementation actually takes the scan
    // definition into account and excludes regions which don't satisfy
    // the start/stop row conditions (HBASE-1829).
    Scan scan = createFilterScan(jobConf, iKey, iTimestamp,
            HiveHBaseInputFormatUtil.getStorageFormatOfKey(keyMapping.mappingSpec,
                    jobConf.get(HBaseSerDe.HBASE_TABLE_DEFAULT_STORAGE_TYPE, "string")));

    // The list of families that have been added to the scan
    List<String> addedFamilies = new ArrayList<String>();

    // REVIEW: are we supposed to be applying the getReadColumnIDs
    // same as in getRecordReader?
    for (ColumnMapping colMap : columnMappings) {
        if (colMap.hbaseRowKey || colMap.hbaseTimestamp) {
            continue;
        }
        if (colMap.qualifierName == null) {
            scan.addFamily(colMap.familyNameBytes);
            addedFamilies.add(colMap.familyName);
        } else {
            if (!addedFamilies.contains(colMap.familyName)) {
                // add the column only if the family has not already been added
                scan.addColumn(colMap.familyNameBytes, colMap.qualifierNameBytes);
            }
        }
    }
    setScan(scan);

    Job job = new Job(jobConf);
    JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
    Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);

    List<org.apache.hadoop.mapreduce.InputSplit> splits = super.getSplits(jobContext);
    InputSplit[] results = new InputSplit[splits.size()];
    for (int i = 0; i < splits.size(); i++) {
        results[i] = new HBaseSplit((TableSplit) splits.get(i), tablePaths[0]);
    }
    return results;
}
From source file:com.phantom.hadoop.examples.Sort.java
License:Apache License
/**
 * The main driver for sort program. Invoke this method to submit the
 * map/reduce job.
 *
 * @throws IOException
 *             When there is communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = conf.get(REDUCES_PER_HOST);
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits) {
                    maxSplits = Integer.MAX_VALUE;
                }
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    job = new Job(conf);
    job.setJobName("sorter");
    job.setJarByClass(Sort.class);
    job.setMapperClass(Mapper.class);
    job.setReducerClass(Reducer.class);
    job.setNumReduceTasks(num_reduces);
    job.setInputFormatClass(inputFormatClass);
    job.setOutputFormatClass(outputFormatClass);
    job.setOutputKeyClass(outputKeyClass);
    job.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(job, otherArgs.get(0));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        job.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(job)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(conf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(conf, partitionFile);
        InputSampler.<K, V>writePartitionFile(job, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, conf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(job)[0] + " into " + FileOutputFormat.getOutputPath(job)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return ret;
}