List of usage examples for org.apache.hadoop.mapreduce.lib.input.FileInputFormat.getInputPaths
public static Path[] getInputPaths(JobContext context)
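Before the per-project examples, here is a minimal driver sketch of how getInputPaths is typically paired with FileInputFormat.setInputPaths. The class name, job name, and input paths below are illustrative assumptions, not taken from any of the source files listed.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class GetInputPathsSketch {
    public static void main(String[] args) throws Exception {
        // Illustrative job name and paths; replace with real input directories.
        Job job = Job.getInstance(new Configuration(), "get-input-paths-sketch");
        FileInputFormat.setInputPaths(job, new Path("/data/in1"), new Path("/data/in2"));

        // Job implements JobContext, so it can be passed straight to getInputPaths.
        for (Path input : FileInputFormat.getInputPaths(job)) {
            System.out.println("configured input path: " + input);
        }
    }
}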
From source file:alluxio.hadoop.mapreduce.KeyValueInputFormat.java
License:Apache License
/**
 * Returns a list of {@link KeyValueInputSplit} where each split is one key-value partition.
 *
 * @param jobContext MapReduce job configuration
 * @return list of {@link InputSplit}s, each split is a partition
 * @throws IOException if information about the partition cannot be retrieved
 */
@Override
public List<InputSplit> getSplits(JobContext jobContext) throws IOException {
    // The paths are the MapReduce program's inputs specified in
    // {@code mapreduce.input.fileinputformat.inputdir}; each path should be a key-value store.
    Path[] paths = FileInputFormat.getInputPaths(jobContext);
    List<InputSplit> splits = new ArrayList<>();
    try {
        for (Path path : paths) {
            List<PartitionInfo> partitionInfos = mKeyValueMasterClient
                    .getPartitionInfo(new AlluxioURI(path.toString()));
            for (PartitionInfo partitionInfo : partitionInfos) {
                splits.add(new KeyValueInputSplit(partitionInfo));
            }
        }
    } catch (AlluxioException e) {
        throw new IOException(e);
    }
    return splits;
}
From source file:bdss.cmu.edu.Sort.java
License:Apache License
/**
 * The main driver for sort program.
 * Invoke this method to submit the map/reduce job.
 * @throws IOException When there is communication problems with the
 *                     job tracker.
 */
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = conf.get(REDUCES_PER_HOST);
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits) {
                    maxSplits = Integer.MAX_VALUE;
                }
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    job = new Job(conf);
    job.setJobName("sorter");
    job.setJarByClass(Sort.class);
    job.setMapperClass(Mapper.class);
    job.setReducerClass(Reducer.class);
    job.setNumReduceTasks(num_reduces);
    job.setInputFormatClass(inputFormatClass);
    job.setOutputFormatClass(outputFormatClass);
    job.setOutputKeyClass(outputKeyClass);
    job.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(job, otherArgs.get(0));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        job.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(job)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(conf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), partitionFile);
        InputSampler.<K, V>writePartitionFile(job, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, conf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(job)[0] + " into " + FileOutputFormat.getOutputPath(job)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return ret;
}
From source file:co.cask.cdap.hive.datasets.DatasetInputFormat.java
License:Apache License
@Override
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
    try (DatasetAccessor datasetAccessor = new DatasetAccessor(jobConf)) {
        try {
            datasetAccessor.initialize();
        } catch (Exception e) {
            throw new IOException("Could not get dataset", e);
        }

        try (RecordScannable recordScannable = datasetAccessor.getDataset()) {
            Job job = new Job(jobConf);
            JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
            Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);

            List<Split> dsSplits = recordScannable.getSplits();
            InputSplit[] inputSplits = new InputSplit[dsSplits.size()];
            for (int i = 0; i < dsSplits.size(); i++) {
                inputSplits[i] = new DatasetInputSplit(dsSplits.get(i), tablePaths[0]);
            }
            return inputSplits;
        }
    }
}
From source file:co.cask.cdap.hive.stream.HiveStreamInputFormat.java
License:Apache License
private StreamInputSplitFinder<InputSplit> getSplitFinder(JobConf conf) throws IOException {
    // first get the context we are in
    ContextManager.Context context = ContextManager.getContext(conf);

    String streamName = conf.get(Constants.Explore.STREAM_NAME);
    String streamNamespace = conf.get(Constants.Explore.STREAM_NAMESPACE);
    Id.Stream streamId = Id.Stream.from(streamNamespace, streamName);
    StreamConfig streamConfig = context.getStreamConfig(streamId);
    // make sure we get the current generation so we don't read events that occurred before a truncate.
    Location streamPath = StreamUtils.createGenerationLocation(streamConfig.getLocation(),
            StreamUtils.getGeneration(streamConfig));

    StreamInputSplitFinder.Builder builder = StreamInputSplitFinder.builder(streamPath.toURI());

    // Get the Hive table path for the InputSplit created. It is just to satisfy hive. The InputFormat never uses it.
    JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(Job.getInstance(conf));
    final Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);

    return setupBuilder(conf, streamConfig, builder).build(new StreamInputSplitFactory<InputSplit>() {
        @Override
        public InputSplit createSplit(Path eventPath, Path indexPath, long startTime, long endTime,
                                      long start, long length, @Nullable String[] locations) {
            return new StreamInputSplit(tablePaths[0], eventPath, indexPath, startTime, endTime, start,
                    length, locations);
        }
    });
}
From source file:com.aliyun.openservices.tablestore.hive.TableStoreInputFormat.java
License:Apache License
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    Configuration dest = translateConfig(job);
    SyncClientInterface ots = null;
    String columns = job.get(TableStoreConsts.COLUMNS_MAPPING);
    if (columns == null) {
        columns = job.get(serdeConstants.LIST_COLUMNS);
    }
    logger.debug("columns to get: {}", columns);

    List<org.apache.hadoop.mapreduce.InputSplit> splits;
    try {
        ots = TableStore.newOtsClient(dest);
        TableMeta meta = fetchTableMeta(ots, job.get(TableStoreConsts.TABLE_NAME));
        RangeRowQueryCriteria criteria = fetchCriteria(meta, columns);
        com.aliyun.openservices.tablestore.hadoop.TableStoreInputFormat.addCriteria(dest, criteria);
        splits = com.aliyun.openservices.tablestore.hadoop.TableStoreInputFormat.getSplits(dest, ots);
    } finally {
        if (ots != null) {
            ots.shutdown();
            ots = null;
        }
    }

    InputSplit[] res = new InputSplit[splits.size()];
    JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(new Job(job));
    Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);
    int i = 0;
    for (org.apache.hadoop.mapreduce.InputSplit split : splits) {
        res[i] = new TableStoreInputSplit(
                (com.aliyun.openservices.tablestore.hadoop.TableStoreInputSplit) split, tablePaths[0]);
        ++i;
    }
    return res;
}
From source file:com.asakusafw.runtime.stage.input.TemporaryInputFormat.java
License:Apache License
/**
 * Returns input paths.
 * @param context current job
 * @return the input paths, or an empty list if they are not set
 * @throws IOException if failed to resolve paths
 * @throws IllegalArgumentException if some parameters were {@code null}
 * @since 0.7.0
 */
public static List<Path> getInputPaths(JobContext context) throws IOException {
    if (context == null) {
        throw new IllegalArgumentException("job must not be null"); //$NON-NLS-1$
    }
    Path[] paths = FileInputFormat.getInputPaths(context);
    if (paths == null || paths.length == 0) {
        return Collections.emptyList();
    }
    return Arrays.asList(paths);
}
From source file:com.ask.hive.hbase.HiveHBaseTextTableInputFormat.java
License:Apache License
public InputSplit[] getSplits(JobConf jobConf, int numSplits) throws IOException {
    String hbaseTableName = jobConf.get(HBaseSerDe.HBASE_TABLE_NAME);
    setHTable(new HTable(new HBaseConfiguration(jobConf), Bytes.toBytes(hbaseTableName)));
    String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);
    if (hbaseColumnsMapping == null) {
        throw new IOException("hbase.columns.mapping required for HBase Table.");
    }

    List<String> hbaseColumnFamilies = new ArrayList<String>();
    List<String> hbaseColumnQualifiers = new ArrayList<String>();
    List<byte[]> hbaseColumnFamiliesBytes = new ArrayList<byte[]>();
    List<byte[]> hbaseColumnQualifiersBytes = new ArrayList<byte[]>();

    int iKey;
    try {
        iKey = HBaseSerDe.parseColumnMapping(hbaseColumnsMapping, hbaseColumnFamilies,
                hbaseColumnFamiliesBytes, hbaseColumnQualifiers, hbaseColumnQualifiersBytes);
    } catch (SerDeException se) {
        throw new IOException(se);
    }

    Scan scan = new Scan();

    // Take filter pushdown into account while calculating splits; this
    // allows us to prune off regions immediately. Note that although
    // the Javadoc for the superclass getSplits says that it returns one
    // split per region, the implementation actually takes the scan
    // definition into account and excludes regions which don't satisfy
    // the start/stop row conditions (HBASE-1829).
    convertFilter(jobConf, scan, null, iKey);

    // REVIEW: are we supposed to be applying the getReadColumnIDs
    // same as in getRecordReader?
    for (int i = 0; i < hbaseColumnFamilies.size(); i++) {
        if (i == iKey) {
            continue;
        }
        if (hbaseColumnQualifiers.get(i) == null) {
            scan.addFamily(hbaseColumnFamiliesBytes.get(i));
        } else {
            scan.addColumn(hbaseColumnFamiliesBytes.get(i), hbaseColumnQualifiersBytes.get(i));
        }
    }
    setScan(scan);

    Job job = new Job(jobConf);
    JobContext jobContext = new JobContext(job.getConfiguration(), job.getJobID());
    Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);

    List<org.apache.hadoop.mapreduce.InputSplit> splits = super.getSplits(jobContext);
    InputSplit[] results = new InputSplit[splits.size()];
    for (int i = 0; i < splits.size(); i++) {
        results[i] = new HBaseSplit((TableSplit) splits.get(i), tablePaths[0]);
    }
    return results;
}
From source file:com.cloudera.castagna.logparser.Utils.java
License:Apache License
public static void log(Job job, Logger log) throws ClassNotFoundException {
    log.debug("{} -> {} ({}, {}) -> {}#{} ({}, {}) -> {}",
            new Object[] { job.getInputFormatClass().getSimpleName(), job.getMapperClass().getSimpleName(),
                    job.getMapOutputKeyClass().getSimpleName(), job.getMapOutputValueClass().getSimpleName(),
                    job.getReducerClass().getSimpleName(), job.getNumReduceTasks(),
                    job.getOutputKeyClass().getSimpleName(), job.getOutputValueClass().getSimpleName(),
                    job.getOutputFormatClass().getSimpleName() });

    Path[] inputs = FileInputFormat.getInputPaths(job);
    Path output = FileOutputFormat.getOutputPath(job);
    log.debug("input: {}", inputs[0]);
    log.debug("output: {}", output);
}
From source file:com.github.dryangkun.hbase.tidx.hive.HiveHBaseTableInputFormat.java
License:Apache License
private InputSplit[] getSplitsInternal(JobConf jobConf, int numSplits) throws IOException {
    // obtain delegation tokens for the job
    if (UserGroupInformation.getCurrentUser().hasKerberosCredentials()) {
        TableMapReduceUtil.initCredentials(jobConf);
    }

    String hbaseTableName = jobConf.get(HBaseSerDe.HBASE_TABLE_NAME);
    String hbaseColumnsMapping = jobConf.get(HBaseSerDe.HBASE_COLUMNS_MAPPING);
    boolean doColumnRegexMatching = jobConf.getBoolean(HBaseSerDe.HBASE_COLUMNS_REGEX_MATCHING, true);

    if (hbaseColumnsMapping == null) {
        throw new IOException(HBaseSerDe.HBASE_COLUMNS_MAPPING + " required for HBase Table.");
    }

    ColumnMappings columnMappings = null;
    int iTimeColumn = -1;
    try {
        columnMappings = HBaseSerDe.parseColumnsMapping(hbaseColumnsMapping, doColumnRegexMatching);
        iTimeColumn = HBaseSerDe.getTxTimeColumnIndex(columnMappings, jobConf);
    } catch (SerDeException e) {
        throw new IOException(e);
    }

    int iKey = columnMappings.getKeyIndex();
    int iTimestamp = columnMappings.getTimestampIndex();
    ColumnMapping keyMapping = columnMappings.getKeyMapping();

    if (iTimeColumn != -1) {
        List<org.apache.hadoop.mapreduce.InputSplit> splits = TxHiveTableInputFormatUtil.getSplits(jobConf,
                numSplits, columnMappings, iTimeColumn, hbaseTableName);
        if (splits != null) {
            Job job = new Job(jobConf);
            JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
            Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);

            InputSplit[] results = new InputSplit[splits.size()];
            for (int i = 0; i < splits.size(); i++) {
                results[i] = new HBaseSplit((TableSplit) splits.get(i), tablePaths[0], true);
            }
            LOG.info("getSplits: TxHiveIndexScan");
            return results;
        }
    }
    LOG.info("getSplits: no TxHiveIndexScan");

    setHTable(new HTable(HBaseConfiguration.create(jobConf), Bytes.toBytes(hbaseTableName)));

    // Take filter pushdown into account while calculating splits; this
    // allows us to prune off regions immediately. Note that although
    // the Javadoc for the superclass getSplits says that it returns one
    // split per region, the implementation actually takes the scan
    // definition into account and excludes regions which don't satisfy
    // the start/stop row conditions (HBASE-1829).
    Scan scan = createFilterScan(jobConf, iKey, iTimestamp,
            HiveHBaseInputFormatUtil.getStorageFormatOfKey(keyMapping.mappingSpec,
                    jobConf.get(HBaseSerDe.HBASE_TABLE_DEFAULT_STORAGE_TYPE, "string")));

    // The list of families that have been added to the scan
    List<String> addedFamilies = new ArrayList<String>();

    // REVIEW: are we supposed to be applying the getReadColumnIDs
    // same as in getRecordReader?
    for (ColumnMapping colMap : columnMappings) {
        if (colMap.hbaseRowKey || colMap.hbaseTimestamp) {
            continue;
        }
        if (colMap.qualifierName == null) {
            scan.addFamily(colMap.familyNameBytes);
            addedFamilies.add(colMap.familyName);
        } else {
            if (!addedFamilies.contains(colMap.familyName)) {
                // add the column only if the family has not already been added
                scan.addColumn(colMap.familyNameBytes, colMap.qualifierNameBytes);
            }
        }
    }
    setScan(scan);

    Job job = new Job(jobConf);
    JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job);
    Path[] tablePaths = FileInputFormat.getInputPaths(jobContext);

    List<org.apache.hadoop.mapreduce.InputSplit> splits = super.getSplits(jobContext);
    InputSplit[] results = new InputSplit[splits.size()];
    for (int i = 0; i < splits.size(); i++) {
        results[i] = new HBaseSplit((TableSplit) splits.get(i), tablePaths[0]);
    }
    return results;
}
From source file:com.phantom.hadoop.examples.Sort.java
License:Apache License
/**
 * The main driver for sort program. Invoke this method to submit the
 * map/reduce job.
 *
 * @throws IOException
 *             When there is communication problems with the job tracker.
 */
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    JobClient client = new JobClient(conf);
    ClusterStatus cluster = client.getClusterStatus();
    int num_reduces = (int) (cluster.getMaxReduceTasks() * 0.9);
    String sort_reduces = conf.get(REDUCES_PER_HOST);
    if (sort_reduces != null) {
        num_reduces = cluster.getTaskTrackers() * Integer.parseInt(sort_reduces);
    }
    Class<? extends InputFormat> inputFormatClass = SequenceFileInputFormat.class;
    Class<? extends OutputFormat> outputFormatClass = SequenceFileOutputFormat.class;
    Class<? extends WritableComparable> outputKeyClass = BytesWritable.class;
    Class<? extends Writable> outputValueClass = BytesWritable.class;
    List<String> otherArgs = new ArrayList<String>();
    InputSampler.Sampler<K, V> sampler = null;
    for (int i = 0; i < args.length; ++i) {
        try {
            if ("-r".equals(args[i])) {
                num_reduces = Integer.parseInt(args[++i]);
            } else if ("-inFormat".equals(args[i])) {
                inputFormatClass = Class.forName(args[++i]).asSubclass(InputFormat.class);
            } else if ("-outFormat".equals(args[i])) {
                outputFormatClass = Class.forName(args[++i]).asSubclass(OutputFormat.class);
            } else if ("-outKey".equals(args[i])) {
                outputKeyClass = Class.forName(args[++i]).asSubclass(WritableComparable.class);
            } else if ("-outValue".equals(args[i])) {
                outputValueClass = Class.forName(args[++i]).asSubclass(Writable.class);
            } else if ("-totalOrder".equals(args[i])) {
                double pcnt = Double.parseDouble(args[++i]);
                int numSamples = Integer.parseInt(args[++i]);
                int maxSplits = Integer.parseInt(args[++i]);
                if (0 >= maxSplits) {
                    maxSplits = Integer.MAX_VALUE;
                }
                sampler = new InputSampler.RandomSampler<K, V>(pcnt, numSamples, maxSplits);
            } else {
                otherArgs.add(args[i]);
            }
        } catch (NumberFormatException except) {
            System.out.println("ERROR: Integer expected instead of " + args[i]);
            return printUsage();
        } catch (ArrayIndexOutOfBoundsException except) {
            System.out.println("ERROR: Required parameter missing from " + args[i - 1]);
            return printUsage(); // exits
        }
    }

    // Set user-supplied (possibly default) job configs
    job = new Job(conf);
    job.setJobName("sorter");
    job.setJarByClass(Sort.class);
    job.setMapperClass(Mapper.class);
    job.setReducerClass(Reducer.class);
    job.setNumReduceTasks(num_reduces);
    job.setInputFormatClass(inputFormatClass);
    job.setOutputFormatClass(outputFormatClass);
    job.setOutputKeyClass(outputKeyClass);
    job.setOutputValueClass(outputValueClass);

    // Make sure there are exactly 2 parameters left.
    if (otherArgs.size() != 2) {
        System.out.println("ERROR: Wrong number of parameters: " + otherArgs.size() + " instead of 2.");
        return printUsage();
    }
    FileInputFormat.setInputPaths(job, otherArgs.get(0));
    FileOutputFormat.setOutputPath(job, new Path(otherArgs.get(1)));

    if (sampler != null) {
        System.out.println("Sampling input to effect total-order sort...");
        job.setPartitionerClass(TotalOrderPartitioner.class);
        Path inputDir = FileInputFormat.getInputPaths(job)[0];
        inputDir = inputDir.makeQualified(inputDir.getFileSystem(conf));
        Path partitionFile = new Path(inputDir, "_sortPartitioning");
        TotalOrderPartitioner.setPartitionFile(conf, partitionFile);
        InputSampler.<K, V>writePartitionFile(job, sampler);
        URI partitionUri = new URI(partitionFile.toString() + "#" + "_sortPartitioning");
        DistributedCache.addCacheFile(partitionUri, conf);
    }

    System.out.println("Running on " + cluster.getTaskTrackers() + " nodes to sort from "
            + FileInputFormat.getInputPaths(job)[0] + " into " + FileOutputFormat.getOutputPath(job)
            + " with " + num_reduces + " reduces.");
    Date startTime = new Date();
    System.out.println("Job started: " + startTime);
    int ret = job.waitForCompletion(true) ? 0 : 1;
    Date end_time = new Date();
    System.out.println("Job ended: " + end_time);
    System.out.println("The job took " + (end_time.getTime() - startTime.getTime()) / 1000 + " seconds.");
    return ret;
}