List of usage examples for org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR_RECURSIVE
String INPUT_DIR_RECURSIVE: the configuration key ("mapreduce.input.fileinputformat.input.dir.recursive") that controls whether FileInputFormat recursively traverses subdirectories of the configured input paths when listing input files.
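Before the examples, here is a minimal, self-contained sketch (not taken from any of the source files below) of the typical usage: setting the key to "true" on the job Configuration tells FileInputFormat to descend into subdirectories of every input path instead of listing only their immediate files. The job relies on Hadoop's default identity Mapper and Reducer, so nothing beyond standard Hadoop classes is assumed.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class RecursiveInputSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // INPUT_DIR_RECURSIVE is the key "mapreduce.input.fileinputformat.input.dir.recursive";
        // when "true", FileInputFormat recurses into subdirectories of each input path.
        conf.set(FileInputFormat.INPUT_DIR_RECURSIVE, "true");
        Job job = Job.getInstance(conf, "recursive-input-sketch");
        job.setJarByClass(RecursiveInputSketch.class);
        // No mapper/reducer set: the default identity classes simply copy the input through.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}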
From source file:com.mycompany.keywordsearch.KeywordSearch.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set(FileInputFormat.INPUT_DIR_RECURSIVE, String.valueOf(true));
    Path input = new Path(args[0]);
    Path output = new Path(args[1]);
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
    System.out.print("Keyword:\t");
    conf.set(KEYWORD, in.readLine());
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(KeywordSearch.class);
    job.setInputFormatClass(TextInputFormatV2.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    clearOutput(conf, output);
    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, output);
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
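A note on the pattern above: the flag is set on the Configuration before the Job is created, so the job inherits it. Recent Hadoop releases also expose a typed convenience setter on the new-API FileInputFormat; the small sketch below assumes that setter is available in your Hadoop version.

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class RecursiveSetterSketch {
    static void enableRecursiveInput(Job job) {
        // Same effect as job.getConfiguration().set(FileInputFormat.INPUT_DIR_RECURSIVE, "true").
        FileInputFormat.setInputDirRecursive(job, true);
    }
}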
From source file:com.streamsets.pipeline.stage.origin.hdfs.cluster.ClusterHdfsSource.java
License:Apache License
@Override
public List<ConfigIssue> init() {
    List<ConfigIssue> issues = super.init();
    validateHadoopFS(issues);
    // This is for getting no of splits - no of executors
    hadoopConf.set(FileInputFormat.LIST_STATUS_NUM_THREADS, "5"); // Per Hive-on-Spark
    hadoopConf.set(FileInputFormat.SPLIT_MAXSIZE, String.valueOf(750000000)); // Per Hive-on-Spark
    for (Map.Entry<String, String> config : hdfsConfigs.entrySet()) {
        hadoopConf.set(config.getKey(), config.getValue());
    }
    List<Path> hdfsDirPaths = new ArrayList<>();
    if (hdfsDirLocations == null || hdfsDirLocations.isEmpty()) {
        issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
            Errors.HADOOPFS_18));
    } else if (issues.isEmpty()) {
        for (String hdfsDirLocation : hdfsDirLocations) {
            try {
                FileSystem fs = getFileSystemForInitDestroy();
                Path ph = fs.makeQualified(new Path(hdfsDirLocation));
                hdfsDirPaths.add(ph);
                if (!fs.exists(ph)) {
                    issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                        Errors.HADOOPFS_10, hdfsDirLocation));
                } else if (!fs.getFileStatus(ph).isDirectory()) {
                    issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                        Errors.HADOOPFS_15, hdfsDirLocation));
                } else {
                    try {
                        FileStatus[] files = fs.listStatus(ph);
                        if (files == null || files.length == 0) {
                            issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                                Errors.HADOOPFS_16, hdfsDirLocation));
                        } else if (getContext().isPreview() && previewBuffer.size() < PREVIEW_SIZE) {
                            for (FileStatus fileStatus : files) {
                                if (fileStatus.isFile()) {
                                    String path = fileStatus.getPath().toString();
                                    try {
                                        List<Map.Entry> buffer;
                                        if (dataFormat == DataFormat.AVRO) {
                                            buffer = previewAvroBatch(fileStatus, PREVIEW_SIZE);
                                        } else {
                                            buffer = previewTextBatch(fileStatus, PREVIEW_SIZE);
                                        }
                                        for (int i = 0; i < buffer.size() && previewBuffer.size() < PREVIEW_SIZE; i++) {
                                            Map.Entry entry = buffer.get(i);
                                            previewBuffer.put(String.valueOf(entry.getKey()),
                                                entry.getValue() == null ? null : entry.getValue());
                                        }
                                    } catch (IOException | InterruptedException ex) {
                                        String msg = "Error opening " + path + ": " + ex;
                                        LOG.info(msg, ex);
                                        issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(),
                                            "hdfsDirLocations", Errors.HADOOPFS_16, fileStatus.getPath()));
                                    }
                                }
                            }
                        }
                    } catch (IOException ex) {
                        issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                            Errors.HADOOPFS_09, hdfsDirLocation, ex.toString(), ex));
                    }
                }
            } catch (IOException ioe) {
                LOG.warn("Error connecting to HDFS filesystem: " + ioe, ioe);
                issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                    Errors.HADOOPFS_11, hdfsDirLocation, ioe.toString(), ioe));
            }
        }
    }
    hadoopConf.set(FileInputFormat.INPUT_DIR, StringUtils.join(hdfsDirPaths, ","));
    hadoopConf.set(FileInputFormat.INPUT_DIR_RECURSIVE, Boolean.toString(recursive));
    switch (dataFormat) {
    case JSON:
        if (jsonMaxObjectLen < 1) {
            issues.add(getContext().createConfigIssue(Groups.JSON.name(), "jsonMaxObjectLen", Errors.HADOOPFS_04));
        }
        break;
    case TEXT:
        if (textMaxLineLen < 1) {
            issues.add(getContext().createConfigIssue(Groups.TEXT.name(), "textMaxLineLen", Errors.HADOOPFS_05));
        }
        break;
    case LOG:
        logDataFormatValidator = new LogDataFormatValidator(logMode, logMaxObjectLen, retainOriginalLine,
            customLogFormat, regex, grokPatternDefinition, grokPattern, enableLog4jCustomLogFormat,
            log4jCustomLogFormat, OnParseError.ERROR, 0, Groups.LOG.name(),
            getFieldPathToGroupMap(fieldPathsToGroupName));
        logDataFormatValidator.validateLogFormatConfig(issues, getContext());
        break;
    case DELIMITED:
        if (csvMaxObjectLen < 1) {
            issues.add(getContext().createConfigIssue(Groups.DELIMITED.name(), "csvMaxObjectLen",
                Errors.HADOOPFS_30));
        }
        break;
    case AVRO:
        if (avroSchema != null && !avroSchema.isEmpty()) {
            hadoopConf.set(AvroJob.INPUT_SCHEMA, avroSchema);
            hadoopConf.set(CONF_INPUT_KEY_SCHEMA, avroSchema);
        }
        break;
    default:
        issues.add(getContext().createConfigIssue(Groups.LOG.name(), "dataFormat", Errors.HADOOPFS_06, dataFormat));
    }
    validateParserFactoryConfigs(issues);
    LOG.info("Issues: " + issues);
    return issues;
}
From source file:com.uber.hoodie.utilities.HDFSParquetImporter.java
License:Apache License
protected JavaRDD<HoodieRecord<HoodieRecordPayload>> buildHoodieRecordsForImport(JavaSparkContext jsc,
        String schemaStr) throws IOException {
    Job job = Job.getInstance(jsc.hadoopConfiguration());
    // Allow recursive directories to be found
    job.getConfiguration().set(FileInputFormat.INPUT_DIR_RECURSIVE, "true");
    // To parallelize reading file status.
    job.getConfiguration().set(FileInputFormat.LIST_STATUS_NUM_THREADS, "1024");
    AvroReadSupport.setAvroReadSchema(jsc.hadoopConfiguration(), new Schema.Parser().parse(schemaStr));
    ParquetInputFormat.setReadSupportClass(job, AvroReadSupport.class);
    return jsc
        .newAPIHadoopFile(cfg.srcPath, ParquetInputFormat.class, Void.class, GenericRecord.class,
            job.getConfiguration())
        // To reduce large number of tasks.
        .coalesce(16 * cfg.parallelism)
        .map(entry -> {
            GenericRecord genericRecord = ((Tuple2<Void, GenericRecord>) entry)._2();
            Object partitionField = genericRecord.get(cfg.partitionKey);
            if (partitionField == null) {
                throw new HoodieIOException("partition key is missing. :" + cfg.partitionKey);
            }
            Object rowField = genericRecord.get(cfg.rowKey);
            if (rowField == null) {
                throw new HoodieIOException("row field is missing. :" + cfg.rowKey);
            }
            String partitionPath = partitionField.toString();
            logger.info("Row Key : " + rowField + ", Partition Path is (" + partitionPath + ")");
            if (partitionField instanceof Number) {
                try {
                    long ts = (long) (Double.parseDouble(partitionField.toString()) * 1000L);
                    partitionPath = PARTITION_FORMATTER.format(new Date(ts));
                } catch (NumberFormatException nfe) {
                    logger.warn("Unable to parse date from partition field. Assuming partition as ("
                        + partitionField + ")");
                }
            }
            return new HoodieRecord<>(new HoodieKey((String) rowField, partitionPath),
                new HoodieJsonPayload(genericRecord.toString()));
        });
}
From source file:kina.config.GenericMongoKinaConfig.java
License:Apache License
private void initBSONDumpConfig() {
    inputFormatClass = KinaBSONFileInputFormat.class;
    Path path = new Path(bsonFile);
    try {
        path = path.getFileSystem(configHadoop).makeQualified(path);
        if (!path.getFileSystem(configHadoop).exists(path)) {
            throw new IOException(new FileNotFoundException(path.getName()));
        }
        String dirStr = org.apache.hadoop.util.StringUtils.escapeString(path.toString());
        String dirs = configHadoop.get(FileInputFormat.INPUT_DIR);
        configHadoop.set(FileInputFormat.INPUT_DIR, dirs == null ? dirStr : dirs + "," + dirStr);
        configHadoop.set(FileInputFormat.INPUT_DIR_RECURSIVE, recursiveBsonFileDiscovery.toString());
        configHadoop.setClass(FileInputFormat.PATHFILTER_CLASS, KinaMongoPathFilter.class, PathFilter.class);
        if (bsonFilesExcludePatterns != null) {
            configHadoop.setStrings(KinaMongoPathFilter.PATH_FILTER_CONF, bsonFilesExcludePatterns);
        }
    } catch (IOException e) {
        throw new kina.exceptions.IOException(e);
    }
}
From source file:org.apache.pig.backend.hadoop.executionengine.tez.util.MRToTezHelper.java
License:Apache License
private static void populateMRSettingsToRetain() {
    // FileInputFormat
    mrSettingsToRetain.add(FileInputFormat.INPUT_DIR);
    mrSettingsToRetain.add(FileInputFormat.SPLIT_MAXSIZE);
    mrSettingsToRetain.add(FileInputFormat.SPLIT_MINSIZE);
    mrSettingsToRetain.add(FileInputFormat.PATHFILTER_CLASS);
    mrSettingsToRetain.add(FileInputFormat.NUM_INPUT_FILES);
    mrSettingsToRetain.add(FileInputFormat.INPUT_DIR_RECURSIVE);

    // FileOutputFormat
    mrSettingsToRetain.add(MRConfiguration.OUTPUT_BASENAME);
    mrSettingsToRetain.add(FileOutputFormat.COMPRESS);
    mrSettingsToRetain.add(FileOutputFormat.COMPRESS_CODEC);
    mrSettingsToRetain.add(FileOutputFormat.COMPRESS_TYPE);
    mrSettingsToRetain.add(FileOutputFormat.OUTDIR);
    mrSettingsToRetain.add(FileOutputCommitter.SUCCESSFUL_JOB_OUTPUT_DIR_MARKER);
}