List of usage examples for the org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR field.
String INPUT_DIR: the configuration key under which FileInputFormat stores a job's input paths as a comma-separated list.
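Before the examples, a minimal self-contained sketch of the two usual ways this key gets populated: setting it directly on a Configuration, or letting FileInputFormat manage it through addInputPath. The /data/input paths and the job name are hypothetical, chosen only for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class InputDirExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Direct use of the constant: the value is a comma-separated list of input paths.
        conf.set(FileInputFormat.INPUT_DIR, "/data/input/2015-01-01,/data/input/2015-01-02");

        // Higher-level equivalent: addInputPath appends another path to the same underlying key.
        Job job = Job.getInstance(conf, "input-dir-example");
        FileInputFormat.addInputPath(job, new Path("/data/input/2015-01-03"));

        // Prints all three paths joined by commas (the last one fully qualified by addInputPath).
        System.out.println(job.getConfiguration().get(FileInputFormat.INPUT_DIR));
    }
}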
From source file:co.cask.cdap.data2.dataset2.lib.file.FileSetDataset.java
License:Apache License
@Override
public Map<String, String> getInputFormatConfiguration(Iterable<? extends Location> inputLocs) {
    ImmutableMap.Builder<String, String> config = ImmutableMap.builder();
    config.putAll(FileSetProperties.getInputProperties(spec.getProperties()));
    config.putAll(FileSetProperties.getInputProperties(runtimeArguments));
    String inputs = Joiner.on(',').join(Iterables.transform(inputLocs, new Function<Location, String>() {
        @Override
        public String apply(@Nullable Location location) {
            return getFileSystemPath(location);
        }
    }));
    config.put(FileInputFormat.INPUT_DIR, inputs);
    return config.build();
}
From source file:co.cask.cdap.data2.dataset2.lib.partitioned.TimePartitionedFileSetTest.java
License:Apache License
/**
 * Validates that the output configuration of the tpfs, when instantiated with (time - start * minutes) as
 * input start time and (time + end * minutes) as input end time, returns the expected list of paths.
 */
private void validateInputPaths(long time, long start, long end, final String... expected)
        throws IOException, DatasetManagementException, InterruptedException, TransactionFailureException {
    Map<String, String> arguments = Maps.newHashMap();
    TimePartitionedFileSetArguments.setInputStartTime(arguments, time + start * MINUTE);
    TimePartitionedFileSetArguments.setInputEndTime(arguments, time + end * MINUTE);
    final TimePartitionedFileSet tpfs = dsFrameworkUtil.getInstance(TPFS_INSTANCE, arguments);
    TransactionAware txAwareDataset = (TransactionAware) tpfs;
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset)
            .execute(new TransactionExecutor.Subroutine() {
                @Override
                public void apply() throws Exception {
                    Map<String, String> inputConfig = tpfs.getInputFormatConfiguration();
                    String inputs = inputConfig.get(FileInputFormat.INPUT_DIR);
                    Assert.assertNotNull(inputs);
                    if (expected.length == 0) {
                        Assert.assertTrue(inputs.isEmpty());
                        return;
                    }
                    String[] inputPaths = inputs.split(",");
                    Assert.assertEquals(expected.length, inputPaths.length);
                    // order is not guaranteed.
                    Arrays.sort(expected);
                    Arrays.sort(inputPaths);
                    for (int i = 0; i < expected.length; i++) {
                        // every input path is absolute, whereas expected paths are relative
                        Assert.assertTrue("path #" + i + " does not match",
                                inputPaths[i].endsWith(expected[i]));
                    }
                }
            });
}
From source file:co.cask.cdap.data2.dataset2.lib.partitioned.TimePartitionedFileSetTest.java
License:Apache License
private void testInputConfiguration(Map<String, String> arguments, final String expectedPath)
        throws Exception {
    final TimePartitionedFileSet dataset = dsFrameworkUtil.getInstance(TPFS_INSTANCE, arguments);
    TransactionAware txAwareDataset = (TransactionAware) dataset;
    dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset)
            .execute(new TransactionExecutor.Subroutine() {
                @Override
                public void apply() throws Exception {
                    Map<String, String> inputConf = dataset.getInputFormatConfiguration();
                    String input = inputConf.get(FileInputFormat.INPUT_DIR);
                    Assert.assertNotNull(input);
                    String[] inputs = input.split(",");
                    Assert.assertEquals(1, inputs.length);
                    Assert.assertTrue(inputs[0].endsWith(expectedPath));
                }
            });
}
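The CDAP examples above return INPUT_DIR inside a plain Map<String, String> rather than setting it on a job directly. Below is a minimal, self-contained sketch of applying such a map to a Hadoop Configuration before job submission; the map contents and paths are hypothetical, not taken from CDAP.

import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import com.google.common.collect.ImmutableMap;

public class ApplyInputConfig {
    public static void main(String[] args) {
        // Hypothetical input format configuration, e.g. as returned by getInputFormatConfiguration().
        Map<String, String> inputConf = ImmutableMap.of(
                FileInputFormat.INPUT_DIR, "/data/fileset/2015-01-01,/data/fileset/2015-01-02");
        Configuration conf = new Configuration();
        // Copy every entry onto the Hadoop Configuration.
        inputConf.forEach(conf::set);
        System.out.println(conf.get(FileInputFormat.INPUT_DIR));
    }
}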
From source file:com.anhth12.lambda.BatchUpdateFunction.java
@Override
public Void call(JavaPairRDD<K, M> newData, Time timestamp) throws Exception {
    if (newData.take(1).isEmpty()) {
        log.info("No data in current generation's RDD; nothing to do");
        return null;
    }
    log.info("Beginning update at {}", timestamp);
    Configuration hadoopConf = sparkContext.hadoopConfiguration();
    JavaPairRDD<K, M> pastData;
    Path inputPathPattern = new Path(dataDirString + "/*/part-*");
    FileSystem fs = FileSystem.get(hadoopConf);
    FileStatus[] inputPathStatuses = fs.globStatus(inputPathPattern);
    if (inputPathStatuses == null || inputPathStatuses.length == 0) {
        log.info("No past data at path(s) {}", inputPathPattern);
        pastData = null;
    } else {
        log.info("Found past data at path(s) like {}", inputPathStatuses[0].getPath());
        Configuration updatedConf = new Configuration(hadoopConf);
        updatedConf.set(FileInputFormat.INPUT_DIR, joinFSPaths(fs, inputPathStatuses));
        JavaPairRDD<Writable, Writable> pastWriteableData = (JavaPairRDD<Writable, Writable>) sparkContext
                .newAPIHadoopRDD(updatedConf, SequenceFileInputFilter.class, keyWritableClass,
                        messageWritableClass);
        pastData = pastWriteableData.mapToPair(
                new WritableToValueFunction<>(keyClass, messageClass, keyWritableClass, messageWritableClass));
    }
    try (TopicProducer<String, U> producer = new TopicProducerImpl<>(updateBroker, updateTopic)) {
        updateInstance.runUpdate(sparkContext, timestamp.milliseconds(), newData, pastData, modelDirString,
                producer);
    }
    return null;
}
From source file:com.anhth12.lambda.BatchUpdateFunction2.java
@Override
public Void call(JavaRDD<MessageAndMetadata> newData, Time timestamp) throws Exception {
    if (newData.take(1).isEmpty()) {
        log.info("No data in current generation's RDD; nothing to do");
        return null;
    }
    log.info("Beginning update at {}", timestamp);
    JavaPairRDD<K, M> newDataKM = newData.mapToPair(new PairFunction<MessageAndMetadata, K, M>() {
        @Override
        public Tuple2<K, M> call(MessageAndMetadata t) throws Exception {
            return (Tuple2<K, M>) new Tuple2<>(new String(t.getKey()), new String(t.getPayload()));
        }
    });
    Configuration hadoopConf = sparkContext.hadoopConfiguration();
    JavaPairRDD<K, M> pastData;
    Path inputPathPattern = new Path(dataDirString + "/*/part-*");
    FileSystem fs = FileSystem.get(hadoopConf);
    FileStatus[] inputPathStatuses = fs.globStatus(inputPathPattern);
    if (inputPathStatuses == null || inputPathStatuses.length == 0) {
        log.info("No past data at path(s) {}", inputPathPattern);
        pastData = null;
    } else {
        log.info("Found past data at path(s) like {}", inputPathStatuses[0].getPath());
        Configuration updatedConf = new Configuration(hadoopConf);
        updatedConf.set(FileInputFormat.INPUT_DIR, joinFSPaths(fs, inputPathStatuses));
        JavaPairRDD<Writable, Writable> pastWriteableData = (JavaPairRDD<Writable, Writable>) sparkContext
                .newAPIHadoopRDD(updatedConf, SequenceFileInputFilter.class, keyWritableClass,
                        messageWritableClass);
        pastData = pastWriteableData.mapToPair(
                new WritableToValueFunction<>(keyClass, messageClass, keyWritableClass, messageWritableClass));
    }
    try (TopicProducer<String, U> producer = new TopicProducerImpl<>(updateBroker, updateTopic)) {
        updateInstance.runUpdate(sparkContext, timestamp.milliseconds(), newDataKM, pastData, modelDirString,
                producer);
    }
    return null;
}
From source file:com.cloudera.oryx.lambda.batch.BatchUpdateFunction.java
License:Open Source License
@Override
public void call(JavaPairRDD<K, M> newData, Time timestamp) throws IOException, InterruptedException {
    if (newData.isEmpty()) {
        log.info("No data in current generation's RDD; nothing to do");
        return;
    }
    log.info("Beginning update at {}", timestamp);
    Configuration hadoopConf = sparkContext.hadoopConfiguration();
    if (hadoopConf.getResource("core-site.xml") == null) {
        log.warn("Hadoop config like core-site.xml was not found; "
                + "is the Hadoop config directory on the classpath?");
    }
    JavaPairRDD<K, M> pastData;
    Path inputPathPattern = new Path(dataDirString + "/*/part-*");
    FileSystem fs = FileSystem.get(inputPathPattern.toUri(), hadoopConf);
    FileStatus[] inputPathStatuses = fs.globStatus(inputPathPattern);
    if (inputPathStatuses == null || inputPathStatuses.length == 0) {
        log.info("No past data at path(s) {}", inputPathPattern);
        pastData = null;
    } else {
        log.info("Found past data at path(s) like {}", inputPathStatuses[0].getPath());
        Configuration updatedConf = new Configuration(hadoopConf);
        updatedConf.set(FileInputFormat.INPUT_DIR, joinFSPaths(fs, inputPathStatuses));
        @SuppressWarnings("unchecked")
        JavaPairRDD<Writable, Writable> pastWritableData = (JavaPairRDD<Writable, Writable>) sparkContext
                .newAPIHadoopRDD(updatedConf, SequenceFileInputFormat.class, keyWritableClass,
                        messageWritableClass);
        pastData = pastWritableData.mapToPair(
                new WritableToValueFunction<>(keyClass, messageClass, keyWritableClass, messageWritableClass));
    }
    if (updateTopic == null || updateBroker == null) {
        log.info("Not producing updates to update topic since none was configured");
        updateInstance.runUpdate(sparkContext, timestamp.milliseconds(), newData, pastData, modelDirString,
                null);
    } else {
        // This TopicProducer should not be async; sends one big model generally and
        // needs to occur before other updates reliably rather than be buffered
        try (TopicProducer<String, U> producer = new TopicProducerImpl<>(updateBroker, updateTopic, false)) {
            updateInstance.runUpdate(sparkContext, timestamp.milliseconds(), newData, pastData,
                    modelDirString, producer);
        }
    }
}
From source file:com.cloudera.oryx.lambda.BatchUpdateFunction.java
License:Open Source License
@Override
public Void call(JavaPairRDD<K, M> newData, Time timestamp) throws IOException, InterruptedException {
    Configuration hadoopConf = sparkContext.hadoopConfiguration();
    JavaPairRDD<K, M> pastData;
    Path inputPathPattern = new Path(dataDirString + "/*/part-*");
    FileSystem fs = FileSystem.get(hadoopConf);
    FileStatus[] inputPathStatuses = fs.globStatus(inputPathPattern);
    if (inputPathStatuses == null || inputPathStatuses.length == 0) {
        log.info("No past data at path(s) {}", inputPathPattern);
        pastData = null;
    } else {
        log.info("Found past data at path(s) like {} , ...", inputPathStatuses[0].getPath());
        Configuration updatedConf = new Configuration(hadoopConf);
        updatedConf.set(FileInputFormat.INPUT_DIR, joinFSPaths(fs, inputPathStatuses));
        @SuppressWarnings("unchecked")
        JavaPairRDD<Writable, Writable> pastWritableData = (JavaPairRDD<Writable, Writable>) sparkContext
                .newAPIHadoopRDD(updatedConf, SequenceFileInputFormat.class, keyWritableClass,
                        messageWritableClass);
        pastData = pastWritableData.mapToPair(
                new WritableToValueFunction<>(keyClass, messageClass, keyWritableClass, messageWritableClass));
    }
    try (TopicProducer<String, U> producer = new TopicProducerImpl<>(updateBroker, updateTopic)) {
        updateInstance.runUpdate(sparkContext, timestamp.milliseconds(), newData, pastData, modelDirString,
                producer);
    }
    return null;
}
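The streaming examples above build the INPUT_DIR value with a joinFSPaths helper that is not shown in this listing. The sketch below is not that project's code; it is only a hypothetical illustration of how matched FileStatus entries can be turned into the comma-separated, comma-escaped form the key expects.

import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.StringUtils;

final class PathJoiner {
    // Illustrative only: qualify each matched path and join them with commas,
    // escaping commas inside individual paths so the combined value parses correctly.
    static String joinFSPaths(FileSystem fs, FileStatus... statuses) {
        List<String> paths = new ArrayList<>();
        for (FileStatus status : statuses) {
            Path path = fs.makeQualified(status.getPath());
            paths.add(StringUtils.escapeString(path.toString()));
        }
        return String.join(",", paths);
    }
}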
From source file:com.cloudera.recordservice.mapreduce.MapReduceTest.java
License:Apache License
private void verifyInputSplitsPath(int numSplits, int numCols, String path) throws IOException {
    Configuration config = new Configuration();
    config.set(FileInputFormat.INPUT_DIR, path);
    verifyInputSplits(numSplits, numCols, config);
}
From source file:com.cloudera.recordservice.mapreduce.MapReduceTest.java
License:Apache License
@Test
public void testGetSplits() throws IOException {
    Configuration config = new Configuration();
    boolean exceptionThrown = false;
    try {
        PlanUtil.getSplits(config, new Credentials());
    } catch (IllegalArgumentException e) {
        exceptionThrown = true;
        assertTrue(e.getMessage().contains("No input specified"));
    }
    assertTrue(exceptionThrown);

    // Set db/table and make sure it works.
    config.set(ConfVars.TBL_NAME_CONF.name, "tpch.nation");
    PlanUtil.getSplits(config, new Credentials());

    // Also set input. This should fail.
    config.set(FileInputFormat.INPUT_DIR, "/test");
    exceptionThrown = false;
    try {
        PlanUtil.getSplits(config, new Credentials());
    } catch (IllegalArgumentException e) {
        exceptionThrown = true;
        assertTrue(e.getMessage(), e.getMessage().contains("More than one input specified"));
    }
    assertTrue(exceptionThrown);

    // Unset the table and set columns. INPUT_DIR and columns don't work now.
    config.unset(ConfVars.TBL_NAME_CONF.name);
    config.setStrings(ConfVars.COL_NAMES_CONF.name, "a");
    exceptionThrown = false;
    try {
        PlanUtil.getSplits(config, new Credentials());
    } catch (IllegalArgumentException e) {
        exceptionThrown = true;
        assertTrue(e.getMessage().contains("Column projections can only be specified with table inputs."));
    }
    assertTrue(exceptionThrown);

    // Test some cases that work
    verifyInputSplitsTable(1, 4, "tpch.nation");
    verifyInputSplitsTable(2, 12, "rs.alltypes");
    verifyInputSplitsTable(1, 1, "tpch.nation", "n_name");
    verifyInputSplitsTable(2, 3, "rs.alltypes", "int_col", "double_col", "string_col");
    verifyInputSplitsPath(1, 1, "/test-warehouse/tpch.nation");

    // Test some cases using the config utility.
    config.clear();
    RecordServiceConfig.setInputTable(config, null, "tpch.nation", "n_nationkey", "n_comment");
    verifyInputSplits(1, 2, config);

    exceptionThrown = false;
    try {
        verifyInputSplitsTable(1, 1, "tpch.nation", "bad");
    } catch (IOException e) {
        exceptionThrown = true;
        assertTrue(e.getCause() instanceof RecordServiceException);
        RecordServiceException ex = (RecordServiceException) e.getCause();
        assertEquals(RecordServiceException.ErrorCode.INVALID_REQUEST, ex.code);
    }
    assertTrue(exceptionThrown);

    exceptionThrown = false;
    try {
        verifyInputSplitsPath(1, 1, "/test-warehouse/tpch.nation,/test-warehouse/tpch.nation");
    } catch (IllegalArgumentException e) {
        exceptionThrown = true;
        assertTrue(e.getMessage().contains("Only reading a single directory is currently supported."));
    }
    assertTrue(exceptionThrown);
}
From source file:com.cloudera.recordservice.mr.PlanUtil.java
License:Apache License
/**
 * Generates a request from the configs set in jobConf.
 */
public static Request getRequest(Configuration jobConf) throws IOException {
    LOG.debug("Generating input splits.");
    String tblName = jobConf.get(ConfVars.TBL_NAME_CONF.name);
    String inputDir = jobConf.get(FileInputFormat.INPUT_DIR);
    String sqlQuery = jobConf.get(ConfVars.QUERY_NAME_CONF.name);

    int numSet = 0;
    if (tblName != null) ++numSet;
    if (inputDir != null) ++numSet;
    if (sqlQuery != null) ++numSet;
    if (numSet == 0) {
        throw new IllegalArgumentException("No input specified. Specify either '" + ConfVars.TBL_NAME_CONF.name
                + "', '" + ConfVars.QUERY_NAME_CONF.name + "' or '" + FileInputFormat.INPUT_DIR + "'");
    }
    if (numSet > 1) {
        throw new IllegalArgumentException("More than one input specified. Can " + "only specify one of '"
                + ConfVars.TBL_NAME_CONF.name + "'=" + tblName + ", '" + FileInputFormat.INPUT_DIR + "'="
                + inputDir + ", '" + ConfVars.QUERY_NAME_CONF.name + "'=" + sqlQuery);
    }

    String[] colNames = jobConf.getStrings(ConfVars.COL_NAMES_CONF.name);
    if (colNames == null) colNames = new String[0];

    if (tblName == null && colNames.length > 0) {
        // TODO: support this.
        throw new IllegalArgumentException("Column projections can only be specified with table inputs.");
    }

    Request request = null;
    if (tblName != null) {
        if (colNames.length == 0) {
            // If length of colNames = 0, return all possible columns
            // TODO: this has slightly different meaning than createProjectionRequest()
            // which treats empty columns as an empty projection. i.e. select * vs count(*)
            // Reconcile this.
            request = Request.createTableScanRequest(tblName);
        } else {
            List<String> projection = new ArrayList<String>();
            for (String c : colNames) {
                if (c == null || c.isEmpty()) {
                    throw new IllegalArgumentException(
                            "Cannot specify projection with null or empty column name.");
                }
                projection.add(c);
            }
            request = Request.createProjectionRequest(tblName, projection);
        }
    } else if (inputDir != null) {
        // TODO: inputDir is a comma separate list of paths. The service needs to
        // handle that.
        if (inputDir.contains(",")) {
            throw new IllegalArgumentException("Only reading a single directory is currently supported.");
        }
        request = Request.createPathRequest(inputDir);
    } else if (sqlQuery != null) {
        request = Request.createSqlRequest(sqlQuery);
    } else {
        Preconditions.checkState(false);
    }
    return request;
}