Example usage for org.apache.hadoop.mapreduce.lib.input FileInputFormat getInputPaths

Introduction

On this page you can find example usage of org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getInputPaths.

Prototype

public static Path[] getInputPaths(JobContext context) 

Document

Get the list of input Paths for the map-reduce job.
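
A minimal sketch of typical usage (the input directory below is a placeholder and the job name is arbitrary): input paths are first registered with FileInputFormat.setInputPaths, then read back from the job context with getInputPaths.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class GetInputPathsExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "get-input-paths-example");

        // Register one or more input directories; "/tmp/example-input" is a placeholder.
        FileInputFormat.setInputPaths(job, new Path("/tmp/example-input"));

        // getInputPaths reads the registered paths back from the job context
        // (Job implements JobContext).
        for (Path path : FileInputFormat.getInputPaths(job)) {
            System.out.println("input path: " + path);
        }
    }
}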

Usage

From source file:alluxio.hadoop.mapreduce.KeyValueInputFormat.java

License:Apache License

/**
 * Returns a list of {@link KeyValueInputSplit} where each split is one key-value partition.
 *
 * @param jobContext MapReduce job configuration
 * @return list of {@link InputSplit}s, each split is a partition
 * @throws IOException if information about the partition cannot be retrieved
 */
@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException {
    // The paths are MapReduce program's inputs specified in
    // {@code mapreduce.input.fileinputformat.inputdir}, each path should be a key-value store.
    Path[] paths = FileInputFormat.getInputPaths(jobContext);
    List<InputSplit> splits = new ArrayList<>();
    try {
        for (Path path : paths) {
            List<PartitionInfo> partitionInfos = mKeyValueMasterClient
                    .getPartitionInfo(new AlluxioURI(path.toString()));
            for (PartitionInfo partitionInfo : partitionInfos) {
                splits.add(new KeyValueInputSplit(partitionInfo));
            }
        }
    } catch (AlluxioException e) {
        throw new IOException(e);
    }
    return splits;
}

From source file:bdss.cmu.edu.Sort.java

License:Apache License

/**
 * The main driver for sort program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there are communication problems with the
 *                     job tracker.
 */
public int run(String[] args) throws Exception {

    Configuration conf = getConf();
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = conf.get(REDUCES_PER_HOST);
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }
    // Set user-supplied (possibly default) job configs
    job = new Job(conf);
    job.setJobName("sorter");
    job.setJarByClass(Sort.class);

    job.setMapperClass(Mapper.class);
    job.setReducerClass(Reducer.class);

    job.setNumReduceTasks(num_reduces);

    job.setInputFormatClass(inputFormatClass);
    job.setOutputFormatClass(outputFormatClass);

    job.setOutputKeyClass(outputKeyClass);
    job.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(job, otherArgs.get(0));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        job.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(job)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(conf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        InputSampler.<K, V>writePartitionFile(job, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, conf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(job)[0] + " into " + FileOutputFormat.getOutputPath(job) + " with "
            + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return ret;
}

From source file:co.cask.cdap.hive.datasets.DatasetInputFormat.java

License:Apache License

@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
    try (DatasetAccessor datasetAccessor = new DatasetAccessor(jobConf)) {
        try {
            datasetAccessor.initialize();
        } catch (Exception e) {
            throw new IOException("Could not get dataset", e);
        }
        try (RecordScannable recordScannable = datasetAccessor.getDataset()) {
            Job job = new Job(jobConf);
            JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
            Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);

            List<Split> dsSplits = recordScannable.getSplits();

            InputSplit[] inputSplits = new InputSplit[dsSplits.size()];
            for (int i = 0; i < dsSplits.size(); i++) {
                inputSplits[i] = new DatasetInputSplit(dsSplits.get(i), tablePaths[0]);
            }
            return inputSplits;
        }
    }
}

From source file:co.cask.cdap.hive.stream.HiveStreamInputFormat.java

License:Apache License

private StreamInputSplitFinder<InputSplit> getSplitFinder(JobConf conf) throws IOException {
    // first get the context we are in
    ContextManager.Context context = ContextManager.getContext(conf);

    String streamName = conf.get(Constants.Explore.STREAM_NAME);
    String streamNamespace = conf.get(Constants.Explore.STREAM_NAMESPACE);
    Id.Stream streamId = Id.Stream.from(streamNamespace, streamName);
    StreamConfig streamConfig = context.getStreamConfig(streamId);
    // make sure we get the current generation so we don't read events that occurred before a truncate.
    Location streamPath = StreamUtils.createGenerationLocation(streamConfig.getLocation(),
            StreamUtils.getGeneration(streamConfig));

    StreamInputSplitFinder.Builder builder = StreamInputSplitFinder.builder(streamPath.toURI());

    // Get the Hive table path for the InputSplit created. It is just to satisfy hive. The InputFormat never uses it.
    JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(Job.getInstance(conf));
    final Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);

    return setupBuilder(conf, streamConfig, builder).build(new StreamInputSplitFactory<InputSplit>() {
        @Override
        public InputSplit createSplit(Path eventPath, Path indexPath, long startTime, long endTime, long start,
                long length, @Nullable String[] locations) {
            return new StreamInputSplit(tablePaths[0], eventPath, indexPath, startTime, endTime, start, length,
                    locations);
        }
    });
}

From source file:com.aliyun.openservices.tablestore.hive.TableStoreInputFormat.java

License:Apache License

@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    Configuration dest = translateConfig(job);
    SyncClientInterface ots = null;
    String columns = job.get(TableStoreConsts.COLUMNS_MAPPING);
    if (columns == null) {
        columns = job.get(serdeConstants.LIST_COLUMNS);
    }
    logger.debug("columns to get: {}", columns);
    List<org.apache.hadoop.mapreduce.InputSplit> splits;
    try {
        ots = TableStore.newOtsClient(dest);
        TableMeta meta = fetchTableMeta(ots, job.get(TableStoreConsts.TABLE_NAME));
        RangeRowQueryCriteria criteria = fetchCriteria(meta, columns);
        com.aliyun.openservices.tablestore.hadoop.TableStoreInputFormat.addCriteria(dest, criteria);
        splits = com.aliyun.openservices.tablestore.hadoop.TableStoreInputFormat.getSplits(dest, ots);
    } finally {
        if (ots != null) {
            ots.shutdown();
            ots = null;
        }
    }
    InputSplit[] res = new InputSplit[splits.size()];
    JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(new Job(job));
    Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);
    int i = 0;
    for (org.apache.hadoop.mapreduce.InputSplit split : splits) {
        res[i] = new TableStoreInputSplit(
                (com.aliyun.openservices.tablestore.hadoop.TableStoreInputSplit) split, tablePaths[0]);
        ++i;
    }
    return res;
}

From source file:com.asakusafw.runtime.stage.input.TemporaryInputFormat.java

License:Apache License

/**
 * Returns input paths.
 * @param context current job
 * @return the input paths, or an empty list if they are not set
 * @throws IOException if failed to resolve paths
 * @throws IllegalArgumentException if some parameters were {@code null}
 * @since 0.7.0
 */
public static List<Path> getInputPaths(JobContext context) throws IOException {
    if (context == null) {
        throw new IllegalArgumentException("job must not be null"); //$NON-NLS-1$
    }
    Path[] paths = FileInputFormat.getInputPaths(context);
    if (paths == null || paths.length == 0) {
        return Collections.emptyList();
    }
    return Arrays.asList(paths);
}
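
A short usage sketch of the wrapper above, assuming the Asakusa TemporaryInputFormat from this source file is on the classpath (the input directory below is a placeholder):

import java.util.List;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import com.asakusafw.runtime.stage.input.TemporaryInputFormat;

public class TemporaryInputPathsExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance();
        FileInputFormat.setInputPaths(job, new Path("/tmp/asakusa-temp"));

        // Unlike FileInputFormat.getInputPaths, the wrapper never returns null:
        // it yields an empty list when no input paths are configured.
        List<Path> inputs = TemporaryInputFormat.getInputPaths(job);
        System.out.println("configured inputs: " + inputs);
    }
}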

From source file:com.ask.hive.hbase.HiveHBaseTextTableInputFormat.java

License:Apache License

public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {

    String hbaseTableName = jobConf.get(HBaseSerDe.HBASE_TABLE_NAME);
    setHTable(new HTable(new HBaseConfiguration(jobConf), Bytes.toBytes(hbaseTableName)));
    String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);

    if (hbaseColumnsMapping == null) {
        throw new IOException("hbase.columns.mapping required for HBase Table.");
    }

    List<String> hbaseColumnFamilies = new ArrayList<String>();
    List<String> hbaseColumnQualifiers = new ArrayList<String>();
    List<byte[]> hbaseColumnFamiliesBytes = new ArrayList<byte[]>();
    List<byte[]> hbaseColumnQualifiersBytes = new ArrayList<byte[]>();

    int iKey;
    try {
        iKey = HBaseSerDe.parseColumnMapping(hbaseColumnsMapping, hbaseColumnFamilies, hbaseColumnFamiliesBytes,
                hbaseColumnQualifiers, hbaseColumnQualifiersBytes);
    } catch (SerDeException se) {
        throw new IOException(se);
    }

    Scan scan = new Scan();

    // Take filter pushdown into account while calculating splits; this
    // allows us to prune off regions immediately.  Note that although
    // the Javadoc for the superclass getSplits says that it returns one
    // split per region, the implementation actually takes the scan
    // definition into account and excludes regions which don't satisfy
    // the start/stop row conditions (HBASE-1829).
    convertFilter(jobConf, scan, null, iKey);

    // REVIEW:  are we supposed to be applying the getReadColumnIDs
    // same as in getRecordReader?
    for (int i = 0; i < hbaseColumnFamilies.size(); i++) {
        if (i == iKey) {
            continue;
        }

        if (hbaseColumnQualifiers.get(i) == null) {
            scan.addFamily(hbaseColumnFamiliesBytes.get(i));
        } else {
            scan.addColumn(hbaseColumnFamiliesBytes.get(i), hbaseColumnQualifiersBytes.get(i));
        }
    }

    setScan(scan);
    Job job = new Job(jobConf);
    JobContext jobContext = new JobContext(job.getConfiguration(), job.getJobID());
    Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);

    List<org.apache.hadoop.mapreduce.InputSplit> splits = super.getSplits(jobContext);
    InputSplit[] results = new InputSplit[splits.size()];

    for (int i = 0; i < splits.size(); i++) {
        results[i] = new HBaseSplit((TableSplit) splits.get(i), tablePaths[0]);
    }

    return results;
}

From source file:com.cloudera.castagna.logparser.Utils.java

License:Apache License

public static void log(Job job, Logger log) throws ClassNotFoundException {
    log.debug("{} -> {} ({}, {}) -> {}#{} ({}, {}) -> {}",
            new Object[] { job.getInputFormatClass().getSimpleName(), job.getMapperClass().getSimpleName(),
                    job.getMapOutputKeyClass().getSimpleName(), job.getMapOutputValueClass().getSimpleName(),
                    job.getReducerClass().getSimpleName(), job.getNumReduceTasks(),
                    job.getOutputKeyClass().getSimpleName(), job.getOutputValueClass().getSimpleName(),
                    job.getOutputFormatClass().getSimpleName() });
    Path[] inputs = FileInputFormat.getInputPaths(job);
    Path output = FileOutputFormat.getOutputPath(job);
    log.debug("input: {}", inputs[0]);
    log.debug("output: {}", output);
}

From source file:com.github.dryangkun.hbase.tidx.hive.HiveHBaseTableInputFormat.java

License:Apache License

private InputSplit[] getSplitsInternal(JobConf jobConf, int numSplits) throws IOException {

    //obtain delegation tokens for the job
    if (UserGroupInformation.getCurrentUser().hasKerberosCredentials()) {
        TableMapReduceUtil.initCredentials(jobConf);
    }

    String hbaseTableName = jobConf.get(HBaseSerDe.HBASE_TABLE_NAME);
    String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);
    boolean doColumnRegexMatching = jobConf.getBoolean(HBaseSerDe.HBASE_COLUMNS_REGEX_MATCHING, true);

    if (hbaseColumnsMapping == null) {
        throw new IOException(HBaseSerDe.HBASE_COLUMNS_MAPPING + " required for HBase Table.");
    }

    ColumnMappings columnMappings = null;
    int iTimeColumn = -1;
    try {
        columnMappings = HBaseSerDe.parseColumnsMapping(hbaseColumnsMapping, doColumnRegexMatching);
        iTimeColumn = HBaseSerDe.getTxTimeColumnIndex(columnMappings, jobConf);
    } catch (SerDeException e) {
        throw new IOException(e);
    }

    int iKey = columnMappings.getKeyIndex();
    int iTimestamp = columnMappings.getTimestampIndex();
    ColumnMapping keyMapping = columnMappings.getKeyMapping();

    if (iTimeColumn != -1) {
        List<org.apache.hadoop.mapreduce.InputSplit> splits = TxHiveTableInputFormatUtil.getSplits(jobConf,
                numSplits, columnMappings, iTimeColumn, hbaseTableName);
        if (splits != null) {
            Job job = new Job(jobConf);
            JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
            Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);

            InputSplit[] results = new InputSplit[splits.size()];
            for (int i = 0; i < splits.size(); i++) {
                results[i] = new HBaseSplit((TableSplit) splits.get(i), tablePaths[0], true);
            }
            LOG.info("getSplits: TxHiveIndexScan");
            return results;
        }
    }
    LOG.info("getSplits: no TxHiveIndexScan");

    setHTable(new HTable(HBaseConfiguration.create(jobConf), Bytes.toBytes(hbaseTableName)));
    // Take filter pushdown into account while calculating splits; this
    // allows us to prune off regions immediately.  Note that although
    // the Javadoc for the superclass getSplits says that it returns one
    // split per region, the implementation actually takes the scan
    // definition into account and excludes regions which don't satisfy
    // the start/stop row conditions (HBASE-1829).
    Scan scan = createFilterScan(jobConf, iKey, iTimestamp, HiveHBaseInputFormatUtil.getStorageFormatOfKey(
            keyMapping.mappingSpec, jobConf.get(HBaseSerDe.HBASE_TABLE_DEFAULT_STORAGE_TYPE, "string")));

    // The list of families that have been added to the scan
    List<String> addedFamilies = new ArrayList<String>();

    // REVIEW:  are we supposed to be applying the getReadColumnIDs
    // same as in getRecordReader?
    for (ColumnMapping colMap : columnMappings) {
        if (colMap.hbaseRowKey || colMap.hbaseTimestamp) {
            continue;
        }

        if (colMap.qualifierName == null) {
            scan.addFamily(colMap.familyNameBytes);
            addedFamilies.add(colMap.familyName);
        } else {
            if (!addedFamilies.contains(colMap.familyName)) {
                // add the column only if the family has not already been added
                scan.addColumn(colMap.familyNameBytes, colMap.qualifierNameBytes);
            }
        }
    }
    setScan(scan);

    Job job = new Job(jobConf);
    JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
    Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);

    List<org.apache.hadoop.mapreduce.InputSplit> splits = super.getSplits(jobContext);
    InputSplit[] results = new InputSplit[splits.size()];

    for (int i = 0; i < splits.size(); i++) {
        results[i] = new HBaseSplit((TableSplit) splits.get(i), tablePaths[0]);
    }

    return results;
}
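
The CDAP, Tablestore, and Hive/HBase examples above all follow the same pattern: wrap the mapred-era JobConf in a new-API JobContext via the Hive shims, then read the Hive table path with getInputPaths so it can be embedded in each custom split. A condensed sketch of that pattern follows (the helper class is hypothetical; ShimLoader comes from hive-shims):

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.shims.ShimLoader;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public final class TablePathHelper {

    private TablePathHelper() {
    }

    /**
     * Returns the Hive table path registered for the current query.
     * The path only exists to satisfy Hive; the custom InputFormats above never read it.
     */
    public static Path getTablePath(JobConf jobConf) throws IOException {
        // Wrap the old-API JobConf in a new-API JobContext so the mapreduce
        // FileInputFormat can resolve mapreduce.input.fileinputformat.inputdir.
        JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(new Job(jobConf));
        Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);
        return tablePaths[0];
    }
}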

From source file:com.phantom.hadoop.examples.Sort.java

License:Apache License

/**
 * The main driver for sort program. Invoke this method to submit the
 * map/reduce job.
 * 
 * @throws IOException
 *             When there are communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {

    Configuration conf = getConf();
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = conf.get(REDUCES_PER_HOST);
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits)
                    maxSplits = Integer.MAX_VALUE;
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }
    // Set user-supplied (possibly default) job configs
    job = new Job(conf);
    job.setJobName("sorter");
    job.setJarByClass(Sort.class);

    job.setMapperClass(Mapper.class);
    job.setReducerClass(Reducer.class);

    job.setNumReduceTasks(num_reduces);

    job.setInputFormatClass(inputFormatClass);
    job.setOutputFormatClass(outputFormatClass);

    job.setOutputKeyClass(outputKeyClass);
    job.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(job, otherArgs.get(0));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        job.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(job)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(conf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(conf, partitionFile);
        InputSampler.<K, V>writePartitionFile(job, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, conf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(job)[0] + " into " + FileOutputFormat.getOutputPath(job) + " with "
            + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return ret;
}