Example usage for org.apache.hadoop.mapreduce.lib.input FileInputFormat INPUT_DIR

Introduction

This page lists example usages of org.apache.hadoop.mapreduce.lib.input FileInputFormat.INPUT_DIR.

Prototype

String INPUT_DIR
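
Before the listings, a minimal sketch of how this constant is commonly used (the input paths here are placeholders, not taken from the examples below): the key can be set directly on a Configuration, and the job-level FileInputFormat.setInputPaths helper populates the same key.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class InputDirExample {
    public static void main(String[] args) throws Exception {
        // Set the input directories directly; the value is a comma-separated list of paths.
        Configuration conf = new Configuration();
        conf.set(FileInputFormat.INPUT_DIR, "/data/input1,/data/input2");

        // The job-level helper writes the same configuration key.
        Job job = Job.getInstance(new Configuration());
        FileInputFormat.setInputPaths(job, new Path("/data/input1"), new Path("/data/input2"));
        System.out.println(job.getConfiguration().get(FileInputFormat.INPUT_DIR));
    }
}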

Usage

From source file: co.cask.cdap.data2.dataset2.lib.file.FileSetDataset.java

License: Apache License

@Override
public Map<String, String> getInputFormatConfiguration(Iterable<? extends Location> inputLocs) {
    ImmutableMap.Builder<String, String> config = ImmutableMap.builder();
    config.putAll(FileSetProperties.getInputProperties(spec.getProperties()));
    config.putAll(FileSetProperties.getInputProperties(runtimeArguments));
    String inputs = Joiner.on(',').join(Iterables.transform(inputLocs, new Function<Location, String>() {
        @Override
        public String apply(@Nullable Location location) {
            return getFileSystemPath(location);
        }
    }));
    config.put(FileInputFormat.INPUT_DIR, inputs);
    return config.build();
}

From source file: co.cask.cdap.data2.dataset2.lib.partitioned.TimePartitionedFileSetTest.java

License: Apache License

/**
 * Validates that the input configuration of the tpfs, when instantiated with (time + start * minutes) as
 * input start time and (time + end * minutes) as input end time, returns the expected list of paths.
 */
private void validateInputPaths(long time, long start, long end, final String... expected)
        throws IOException, DatasetManagementException, InterruptedException, TransactionFailureException {
    Map<String, String> arguments = Maps.newHashMap();
    TimePartitionedFileSetArguments.setInputStartTime(arguments, time + start * MINUTE);
    TimePartitionedFileSetArguments.setInputEndTime(arguments, time + end * MINUTE);
    final TimePartitionedFileSet tpfs = dsFrameworkUtil.getInstance(TPFS_INSTANCE, arguments);
    TransactionAware txAwareDataset = (TransactionAware) tpfs;
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset)
            .execute(new TransactionExecutor.Subroutine() {
                @Override
                public void apply() throws Exception {
                    Map<String, String> inputConfig = tpfs.getInputFormatConfiguration();
                    String inputs = inputConfig.get(FileInputFormat.INPUT_DIR);
                    Assert.assertNotNull(inputs);
                    if (expected.length == 0) {
                        Assert.assertTrue(inputs.isEmpty());
                        return;
                    }
                    String[] inputPaths = inputs.split(",");
                    Assert.assertEquals(expected.length, inputPaths.length);
                    // order is not guaranteed.
                    Arrays.sort(expected);
                    Arrays.sort(inputPaths);
                    for (int i = 0; i < expected.length; i++) {
                        // every input path is absolute, whereas expected paths are relative
                        Assert.assertTrue("path #" + i + " does not match",
                                inputPaths[i].endsWith(expected[i]));
                    }
                }
            });
}

From source file: co.cask.cdap.data2.dataset2.lib.partitioned.TimePartitionedFileSetTest.java

License: Apache License

private void testInputConfiguration(Map<String, String> arguments, final String expectedPath) throws Exception {
    final TimePartitionedFileSet dataset = dsFrameworkUtil.getInstance(TPFS_INSTANCE, arguments);
    TransactionAware txAwareDataset = (TransactionAware) dataset;
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset)
            .execute(new TransactionExecutor.Subroutine() {
                @Override
                public void apply() throws Exception {
                    Map<String, String> inputConf = dataset.getInputFormatConfiguration();
                    String input = inputConf.get(FileInputFormat.INPUT_DIR);
                    Assert.assertNotNull(input);
                    String[] inputs = input.split(",");
                    Assert.assertEquals(1, inputs.length);
                    Assert.assertTrue(inputs[0].endsWith(expectedPath));
                }
            });
}

From source file: com.anhth12.lambda.BatchUpdateFunction.java

@Override
public Void call(JavaPairRDD<K, M> newData, Time timestamp) throws Exception {
    if (newData.take(1).isEmpty()) {
        log.info("No data in current generation's RDD; nothing to do");
        return null;
    }

    log.info("Beginning update at {}", timestamp);

    Configuration hadoopConf = sparkContext.hadoopConfiguration();

    JavaPairRDD<K, M> pastData;
    Path inputPathPattern = new Path(dataDirString + "/*/part-*");
    FileSystem fs = FileSystem.get(hadoopConf);
    FileStatus[] inputPathStatuses = fs.globStatus(inputPathPattern);

    if (inputPathStatuses == null || inputPathStatuses.length == 0) {
        log.info("No past data at path(s) {}", inputPathPattern);
        pastData = null;
    } else {
        log.info("Found past data at path(s) like {}", inputPathStatuses[0].getPath());
        Configuration updatedConf = new Configuration(hadoopConf);
        updatedConf.set(FileInputFormat.INPUT_DIR, joinFSPaths(fs, inputPathStatuses));
        JavaPairRDD<Writable, Writable> pastWriteableData = (JavaPairRDD<Writable, Writable>) sparkContext
                .newAPIHadoopRDD(updatedConf, SequenceFileInputFilter.class, keyWritableClass,
                        messageWritableClass);
        pastData = pastWriteableData.mapToPair(
                new WritableToValueFunction<>(keyClass, messageClass, keyWritableClass, messageWritableClass));

    }

    try (TopicProducer<String, U> producer = new TopicProducerImpl<>(updateBroker, updateTopic)) {
        updateInstance.runUpdate(sparkContext, timestamp.milliseconds(), newData, pastData, modelDirString,
                producer);
    }
    return null;
}

From source file: com.anhth12.lambda.BatchUpdateFunction2.java

@Override
public Void call(JavaRDD<MessageAndMetadata> newData, Time timestamp) throws Exception {
    if (newData.take(1).isEmpty()) {
        log.info("No data in current generation's RDD; nothing to do");
        return null;
    }

    log.info("Beginning update at {}", timestamp);

    JavaPairRDD<K, M> newDataKM = newData.mapToPair(new PairFunction<MessageAndMetadata, K, M>() {

        @Override
        public Tuple2<K, M> call(MessageAndMetadata t) throws Exception {

            return (Tuple2<K, M>) new Tuple2<>(new String(t.getKey()), new String(t.getPayload()));
        }
    });

    Configuration hadoopConf = sparkContext.hadoopConfiguration();

    JavaPairRDD<K, M> pastData;
    Path inputPathPattern = new Path(dataDirString + "/*/part-*");
    FileSystem fs = FileSystem.get(hadoopConf);
    FileStatus[] inputPathStatuses = fs.globStatus(inputPathPattern);

    if (inputPathStatuses == null || inputPathStatuses.length == 0) {
        log.info("No past data at path(s) {}", inputPathPattern);
        pastData = null;
    } else {
        log.info("Found past data at path(s) like {}", inputPathStatuses[0].getPath());
        Configuration updatedConf = new Configuration(hadoopConf);
        updatedConf.set(FileInputFormat.INPUT_DIR, joinFSPaths(fs, inputPathStatuses));
        JavaPairRDD<Writable, Writable> pastWriteableData = (JavaPairRDD<Writable, Writable>) sparkContext
                .newAPIHadoopRDD(updatedConf, SequenceFileInputFilter.class, keyWritableClass,
                        messageWritableClass);
        pastData = pastWriteableData.mapToPair(
                new WritableToValueFunction<>(keyClass, messageClass, keyWritableClass, messageWritableClass));

    }
    try (TopicProducer<String, U> producer = new TopicProducerImpl<>(updateBroker, updateTopic)) {
        updateInstance.runUpdate(sparkContext, timestamp.milliseconds(), newDataKM, pastData, modelDirString,
                producer);
    }
    return null;

}

From source file: com.cloudera.oryx.lambda.batch.BatchUpdateFunction.java

License: Open Source License

@Override
public void call(JavaPairRDD<K, M> newData, Time timestamp) throws IOException, InterruptedException {

    if (newData.isEmpty()) {
        log.info("No data in current generation's RDD; nothing to do");
        return;
    }

    log.info("Beginning update at {}", timestamp);

    Configuration hadoopConf = sparkContext.hadoopConfiguration();
    if (hadoopConf.getResource("core-site.xml") == null) {
        log.warn("Hadoop config like core-site.xml was not found; "
                + "is the Hadoop config directory on the classpath?");
    }

    JavaPairRDD<K, M> pastData;
    Path inputPathPattern = new Path(dataDirString + "/*/part-*");
    FileSystem fs = FileSystem.get(inputPathPattern.toUri(), hadoopConf);
    FileStatus[] inputPathStatuses = fs.globStatus(inputPathPattern);
    if (inputPathStatuses == null || inputPathStatuses.length == 0) {

        log.info("No past data at path(s) {}", inputPathPattern);
        pastData = null;

    } else {

        log.info("Found past data at path(s) like {}", inputPathStatuses[0].getPath());
        Configuration updatedConf = new Configuration(hadoopConf);
        updatedConf.set(FileInputFormat.INPUT_DIR, joinFSPaths(fs, inputPathStatuses));

        @SuppressWarnings("unchecked")
        JavaPairRDD<Writable, Writable> pastWritableData = (JavaPairRDD<Writable, Writable>) sparkContext
                .newAPIHadoopRDD(updatedConf, SequenceFileInputFormat.class, keyWritableClass,
                        messageWritableClass);

        pastData = pastWritableData.mapToPair(
                new WritableToValueFunction<>(keyClass, messageClass, keyWritableClass, messageWritableClass));
    }

    if (updateTopic == null || updateBroker == null) {
        log.info("Not producing updates to update topic since none was configured");
        updateInstance.runUpdate(sparkContext, timestamp.milliseconds(), newData, pastData, modelDirString,
                null);
    } else {
        // This TopicProducer should not be async; sends one big model generally and
        // needs to occur before other updates reliably rather than be buffered
        try (TopicProducer<String, U> producer = new TopicProducerImpl<>(updateBroker, updateTopic, false)) {
            updateInstance.runUpdate(sparkContext, timestamp.milliseconds(), newData, pastData, modelDirString,
                    producer);
        }
    }
}

From source file: com.cloudera.oryx.lambda.BatchUpdateFunction.java

License: Open Source License

@Override
public Void call(JavaPairRDD<K, M> newData, Time timestamp) throws IOException, InterruptedException {

    Configuration hadoopConf = sparkContext.hadoopConfiguration();

    JavaPairRDD<K, M> pastData;
    Path inputPathPattern = new Path(dataDirString + "/*/part-*");
    FileSystem fs = FileSystem.get(hadoopConf);
    FileStatus[] inputPathStatuses = fs.globStatus(inputPathPattern);
    if (inputPathStatuses == null || inputPathStatuses.length == 0) {

        log.info("No past data at path(s) {}", inputPathPattern);
        pastData = null;

    } else {

        log.info("Found past data at path(s) like {} , ...", inputPathStatuses[0].getPath());
        Configuration updatedConf = new Configuration(hadoopConf);
        updatedConf.set(FileInputFormat.INPUT_DIR, joinFSPaths(fs, inputPathStatuses));

        @SuppressWarnings("unchecked")
        JavaPairRDD<Writable, Writable> pastWritableData = (JavaPairRDD<Writable, Writable>) sparkContext
                .newAPIHadoopRDD(updatedConf, SequenceFileInputFormat.class, keyWritableClass,
                        messageWritableClass);

        pastData = pastWritableData.mapToPair(
                new WritableToValueFunction<>(keyClass, messageClass, keyWritableClass, messageWritableClass));
    }

    try (TopicProducer<String, U> producer = new TopicProducerImpl<>(updateBroker, updateTopic)) {
        updateInstance.runUpdate(sparkContext, timestamp.milliseconds(), newData, pastData, modelDirString,
                producer);
    }

    return null;
}

From source file: com.cloudera.recordservice.mapreduce.MapReduceTest.java

License: Apache License

private void verifyInputSplitsPath(int numSplits, int numCols, String path) throws IOException {
    Configuration config = new Configuration();
    config.set(FileInputFormat.INPUT_DIR, path);
    verifyInputSplits(numSplits, numCols, config);
}

From source file: com.cloudera.recordservice.mapreduce.MapReduceTest.java

License: Apache License

@Test
public void testGetSplits() throws IOException {
    Configuration config = new Configuration();

    boolean exceptionThrown = false;
    try {
        PlanUtil.getSplits(config, new Credentials());
    } catch (IllegalArgumentException e) {
        exceptionThrown = true;
        assertTrue(e.getMessage().contains("No input specified"));
    }
    assertTrue(exceptionThrown);

    // Set db/table and make sure it works.
    config.set(ConfVars.TBL_NAME_CONF.name, "tpch.nation");
    PlanUtil.getSplits(config, new Credentials());

    // Also set input. This should fail.
    config.set(FileInputFormat.INPUT_DIR, "/test");
    exceptionThrown = false;
    try {
        PlanUtil.getSplits(config, new Credentials());
    } catch (IllegalArgumentException e) {
        exceptionThrown = true;
        assertTrue(e.getMessage(), e.getMessage().contains("More than one input specified"));
    }
    assertTrue(exceptionThrown);

    // Unset the table and set columns. INPUT_DIR and columns don't work now.
    config.unset(ConfVars.TBL_NAME_CONF.name);
    config.setStrings(ConfVars.COL_NAMES_CONF.name, "a");
    exceptionThrown = false;
    try {
        PlanUtil.getSplits(config, new Credentials());
    } catch (IllegalArgumentException e) {
        exceptionThrown = true;
        assertTrue(e.getMessage().contains("Column projections can only be specified with table inputs."));
    }
    assertTrue(exceptionThrown);

    // Test some cases that work
    verifyInputSplitsTable(1, 4, "tpch.nation");
    verifyInputSplitsTable(2, 12, "rs.alltypes");
    verifyInputSplitsTable(1, 1, "tpch.nation", "n_name");
    verifyInputSplitsTable(2, 3, "rs.alltypes", "int_col", "double_col", "string_col");
    verifyInputSplitsPath(1, 1, "/test-warehouse/tpch.nation");

    // Test some cases using the config utility.
    config.clear();
    RecordServiceConfig.setInputTable(config, null, "tpch.nation", "n_nationkey", "n_comment");
    verifyInputSplits(1, 2, config);

    exceptionThrown = false;
    try {
        verifyInputSplitsTable(1, 1, "tpch.nation", "bad");
    } catch (IOException e) {
        exceptionThrown = true;
        assertTrue(e.getCause() instanceof RecordServiceException);
        RecordServiceException ex = (RecordServiceException) e.getCause();
        assertEquals(RecordServiceException.ErrorCode.INVALID_REQUEST, ex.code);
    }
    assertTrue(exceptionThrown);

    exceptionThrown = false;
    try {
        verifyInputSplitsPath(1, 1, "/test-warehouse/tpch.nation,/test-warehouse/tpch.nation");
    } catch (IllegalArgumentException e) {
        exceptionThrown = true;
        assertTrue(e.getMessage().contains("Only reading a single directory is currently supported."));
    }
    assertTrue(exceptionThrown);
}

From source file: com.cloudera.recordservice.mr.PlanUtil.java

License: Apache License

/**
 * Generates a request from the configs set in jobConf.
 */
public static Request getRequest(Configuration jobConf) throws IOException {
    LOG.debug("Generating input splits.");

    String tblName = jobConf.get(ConfVars.TBL_NAME_CONF.name);
    String inputDir = jobConf.get(FileInputFormat.INPUT_DIR);
    String sqlQuery = jobConf.get(ConfVars.QUERY_NAME_CONF.name);

    int numSet = 0;
    if (tblName != null)
        ++numSet;
    if (inputDir != null)
        ++numSet;
    if (sqlQuery != null)
        ++numSet;

    if (numSet == 0) {
        throw new IllegalArgumentException("No input specified. Specify either '" + ConfVars.TBL_NAME_CONF.name
                + "', '" + ConfVars.QUERY_NAME_CONF.name + "' or '" + FileInputFormat.INPUT_DIR + "'");
    }
    if (numSet > 1) {
        throw new IllegalArgumentException("More than one input specified. Can " + "only specify one of '"
                + ConfVars.TBL_NAME_CONF.name + "'=" + tblName + ", '" + FileInputFormat.INPUT_DIR + "'="
                + inputDir + ", '" + ConfVars.QUERY_NAME_CONF.name + "'=" + sqlQuery);
    }

    String[] colNames = jobConf.getStrings(ConfVars.COL_NAMES_CONF.name);
    if (colNames == null)
        colNames = new String[0];

    if (tblName == null && colNames.length > 0) {
        // TODO: support this.
        throw new IllegalArgumentException("Column projections can only be specified with table inputs.");
    }

    Request request = null;
    if (tblName != null) {
        if (colNames.length == 0) {
            // If length of colNames = 0, return all possible columns
            // TODO: this has slightly different meaning than createProjectionRequest()
            // which treats empty columns as an empty projection. i.e. select * vs count(*)
            // Reconcile this.
            request = Request.createTableScanRequest(tblName);
        } else {
            List<String> projection = new ArrayList<String>();
            for (String c : colNames) {
                if (c == null || c.isEmpty()) {
                    throw new IllegalArgumentException(
                            "Cannot specify projection with null or empty column name.");
                }
                projection.add(c);
            }
            request = Request.createProjectionRequest(tblName, projection);
        }
    } else if (inputDir != null) {
        // TODO: inputDir is a comma separate list of paths. The service needs to
        // handle that.
        if (inputDir.contains(",")) {
            throw new IllegalArgumentException("Only reading a single directory is currently supported.");
        }
        request = Request.createPathRequest(inputDir);
    } else if (sqlQuery != null) {
        request = Request.createSqlRequest(sqlQuery);
    } else {
        Preconditions.checkState(false);
    }
    return request;
}