List of usage examples for org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR_RECURSIVE
String INPUT_DIR_RECURSIVE: the configuration key ("mapreduce.input.fileinputformat.input.dir.recursive") that controls whether FileInputFormat recursively traverses subdirectories of the configured input paths when listing input files.
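Before the examples, here is a minimal, self-contained sketch (not taken from any of the source files below) of the typical usage: setting the key to "true" on the job Configuration tells FileInputFormat to descend into subdirectories of every input path instead of listing only their immediate files. The job relies on Hadoop's default identity Mapper and Reducer, so nothing beyond standard Hadoop classes is assumed.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class RecursiveInputSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // INPUT_DIR_RECURSIVE is the key "mapreduce.input.fileinputformat.input.dir.recursive";
        // when "true", FileInputFormat recurses into subdirectories of each input path.
        conf.set(FileInputFormat.INPUT_DIR_RECURSIVE, "true");
        Job job = Job.getInstance(conf, "recursive-input-sketch");
        job.setJarByClass(RecursiveInputSketch.class);
        // No mapper/reducer set: the default identity classes simply copy the input through.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}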
From source file:com.mycompany.keywordsearch.KeywordSearch.java
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    conf.set(FileInputFormat.INPUT_DIR_RECURSIVE, String.valueOf(true));
    Path input = new Path(args[0]);
    Path output = new Path(args[1]);
    BufferedReader in = new BufferedReader(new InputStreamReader(System.in, "UTF-8"));
    System.out.print("Keyword:\t");
    conf.set(KEYWORD, in.readLine());
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(KeywordSearch.class);
    job.setInputFormatClass(TextInputFormatV2.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    clearOutput(conf, output);
    FileInputFormat.addInputPath(job, input);
    FileOutputFormat.setOutputPath(job, output);
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
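A note on the pattern above: the flag is set on the Configuration before the Job is created, so the job inherits it. Recent Hadoop releases also expose a typed convenience setter on the new-API FileInputFormat; the small sketch below assumes that setter is available in your Hadoop version.

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class RecursiveSetterSketch {
    static void enableRecursiveInput(Job job) {
        // Same effect as job.getConfiguration().set(FileInputFormat.INPUT_DIR_RECURSIVE, "true").
        FileInputFormat.setInputDirRecursive(job, true);
    }
}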
From source file:com.streamsets.pipeline.stage.origin.hdfs.cluster.ClusterHdfsSource.java
License:Apache License
@Override
public List<ConfigIssue> init() {
    List<ConfigIssue> issues = super.init();
    validateHadoopFS(issues);
    // This is for getting no of splits - no of executors
    hadoopConf.set(FileInputFormat.LIST_STATUS_NUM_THREADS, "5"); // Per Hive-on-Spark
    hadoopConf.set(FileInputFormat.SPLIT_MAXSIZE, String.valueOf(750000000)); // Per Hive-on-Spark
    for (Map.Entry<String, String> config : hdfsConfigs.entrySet()) {
        hadoopConf.set(config.getKey(), config.getValue());
    }
    List<Path> hdfsDirPaths = new ArrayList<>();
    if (hdfsDirLocations == null || hdfsDirLocations.isEmpty()) {
        issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
            Errors.HADOOPFS_18));
    } else if (issues.isEmpty()) {
        for (String hdfsDirLocation : hdfsDirLocations) {
            try {
                FileSystem fs = getFileSystemForInitDestroy();
                Path ph = fs.makeQualified(new Path(hdfsDirLocation));
                hdfsDirPaths.add(ph);
                if (!fs.exists(ph)) {
                    issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                        Errors.HADOOPFS_10, hdfsDirLocation));
                } else if (!fs.getFileStatus(ph).isDirectory()) {
                    issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                        Errors.HADOOPFS_15, hdfsDirLocation));
                } else {
                    try {
                        FileStatus[] files = fs.listStatus(ph);
                        if (files == null || files.length == 0) {
                            issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                                Errors.HADOOPFS_16, hdfsDirLocation));
                        } else if (getContext().isPreview() && previewBuffer.size() < PREVIEW_SIZE) {
                            for (FileStatus fileStatus : files) {
                                if (fileStatus.isFile()) {
                                    String path = fileStatus.getPath().toString();
                                    try {
                                        List<Map.Entry> buffer;
                                        if (dataFormat == DataFormat.AVRO) {
                                            buffer = previewAvroBatch(fileStatus, PREVIEW_SIZE);
                                        } else {
                                            buffer = previewTextBatch(fileStatus, PREVIEW_SIZE);
                                        }
                                        for (int i = 0; i < buffer.size() && previewBuffer.size() < PREVIEW_SIZE; i++) {
                                            Map.Entry entry = buffer.get(i);
                                            previewBuffer.put(String.valueOf(entry.getKey()),
                                                entry.getValue() == null ? null : entry.getValue());
                                        }
                                    } catch (IOException | InterruptedException ex) {
                                        String msg = "Error opening " + path + ": " + ex;
                                        LOG.info(msg, ex);
                                        issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(),
                                            "hdfsDirLocations", Errors.HADOOPFS_16, fileStatus.getPath()));
                                    }
                                }
                            }
                        }
                    } catch (IOException ex) {
                        issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                            Errors.HADOOPFS_09, hdfsDirLocation, ex.toString(), ex));
                    }
                }
            } catch (IOException ioe) {
                LOG.warn("Error connecting to HDFS filesystem: " + ioe, ioe);
                issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                    Errors.HADOOPFS_11, hdfsDirLocation, ioe.toString(), ioe));
            }
        }
    }
    hadoopConf.set(FileInputFormat.INPUT_DIR, StringUtils.join(hdfsDirPaths, ","));
    hadoopConf.set(FileInputFormat.INPUT_DIR_RECURSIVE, Boolean.toString(recursive));
    switch (dataFormat) {
    case JSON:
        if (jsonMaxObjectLen < 1) {
            issues.add(getContext().createConfigIssue(Groups.JSON.name(), "jsonMaxObjectLen", Errors.HADOOPFS_04));
        }
        break;
    case TEXT:
        if (textMaxLineLen < 1) {
            issues.add(getContext().createConfigIssue(Groups.TEXT.name(), "textMaxLineLen", Errors.HADOOPFS_05));
        }
        break;
    case LOG:
        logDataFormatValidator = new LogDataFormatValidator(logMode, logMaxObjectLen, retainOriginalLine,
            customLogFormat, regex, grokPatternDefinition, grokPattern, enableLog4jCustomLogFormat,
            log4jCustomLogFormat, OnParseError.ERROR, 0, Groups.LOG.name(),
            getFieldPathToGroupMap(fieldPathsToGroupName));
        logDataFormatValidator.validateLogFormatConfig(issues, getContext());
        break;
    case DELIMITED:
        if (csvMaxObjectLen < 1) {
            issues.add(getContext().createConfigIssue(Groups.DELIMITED.name(), "csvMaxObjectLen",
                Errors.HADOOPFS_30));
        }
        break;
    case AVRO:
        if (avroSchema != null && !avroSchema.isEmpty()) {
            hadoopConf.set(AvroJob.INPUT_SCHEMA, avroSchema);
            hadoopConf.set(CONF_INPUT_KEY_SCHEMA, avroSchema);
        }
        break;
    default:
        issues.add(getContext().createConfigIssue(Groups.LOG.name(), "dataFormat", Errors.HADOOPFS_06, dataFormat));
    }
    validateParserFactoryConfigs(issues);
    LOG.info("Issues: " + issues);
    return issues;
}
From source file:com.uber.hoodie.utilities.HDFSParquetImporter.java
License:Apache License
protected JavaRDD<HoodieRecord<HoodieRecordPayload>> buildHoodieRecordsForImport(JavaSparkContext jsc,
        String schemaStr) throws IOException {
    Job job = Job.getInstance(jsc.hadoopConfiguration());
    // Allow recursive directories to be found
    job.getConfiguration().set(FileInputFormat.INPUT_DIR_RECURSIVE, "true");
    // To parallelize reading file status.
    job.getConfiguration().set(FileInputFormat.LIST_STATUS_NUM_THREADS, "1024");
    AvroReadSupport.setAvroReadSchema(jsc.hadoopConfiguration(), new Schema.Parser().parse(schemaStr));
    ParquetInputFormat.setReadSupportClass(job, AvroReadSupport.class);
    return jsc
        .newAPIHadoopFile(cfg.srcPath, ParquetInputFormat.class, Void.class, GenericRecord.class,
            job.getConfiguration())
        // To reduce large number of tasks.
        .coalesce(16 * cfg.parallelism)
        .map(entry -> {
            GenericRecord genericRecord = ((Tuple2<Void, GenericRecord>) entry)._2();
            Object partitionField = genericRecord.get(cfg.partitionKey);
            if (partitionField == null) {
                throw new HoodieIOException("partition key is missing. :" + cfg.partitionKey);
            }
            Object rowField = genericRecord.get(cfg.rowKey);
            if (rowField == null) {
                throw new HoodieIOException("row field is missing. :" + cfg.rowKey);
            }
            String partitionPath = partitionField.toString();
            logger.info("Row Key : " + rowField + ", Partition Path is (" + partitionPath + ")");
            if (partitionField instanceof Number) {
                try {
                    long ts = (long) (Double.parseDouble(partitionField.toString()) * 1000L);
                    partitionPath = PARTITION_FORMATTER.format(new Date(ts));
                } catch (NumberFormatException nfe) {
                    logger.warn("Unable to parse date from partition field. Assuming partition as ("
                        + partitionField + ")");
                }
            }
            return new HoodieRecord<>(new HoodieKey((String) rowField, partitionPath),
                new HoodieJsonPayload(genericRecord.toString()));
        });
}
From source file:kina.config.GenericMongoKinaConfig.java
License:Apache License
private void initBSONDumpConfig() {
    inputFormatClass = KinaBSONFileInputFormat.class;
    Path path = new Path(bsonFile);
    try {
        path = path.getFileSystem(configHadoop).makeQualified(path);
        if (!path.getFileSystem(configHadoop).exists(path)) {
            throw new IOException(new FileNotFoundException(path.getName()));
        }
        String dirStr = org.apache.hadoop.util.StringUtils.escapeString(path.toString());
        String dirs = configHadoop.get(FileInputFormat.INPUT_DIR);
        configHadoop.set(FileInputFormat.INPUT_DIR, dirs == null ? dirStr : dirs + "," + dirStr);
        configHadoop.set(FileInputFormat.INPUT_DIR_RECURSIVE, recursiveBsonFileDiscovery.toString());
        configHadoop.setClass(FileInputFormat.PATHFILTER_CLASS, KinaMongoPathFilter.class, PathFilter.class);
        if (bsonFilesExcludePatterns != null) {
            configHadoop.setStrings(KinaMongoPathFilter.PATH_FILTER_CONF, bsonFilesExcludePatterns);
        }
    } catch (IOException e) {
        throw new kina.exceptions.IOException(e);
    }
}
From source file:org.apache.pig.backend.hadoop.executionengine.tez.util.MRToTezHelper.java
License:Apache License
private static void populateMRSettingsToRetain() {
    // FileInputFormat
    mrSettingsToRetain.add(FileInputFormat.INPUT_DIR);
    mrSettingsToRetain.add(FileInputFormat.SPLIT_MAXSIZE);
    mrSettingsToRetain.add(FileInputFormat.SPLIT_MINSIZE);
    mrSettingsToRetain.add(FileInputFormat.PATHFILTER_CLASS);
    mrSettingsToRetain.add(FileInputFormat.NUM_INPUT_FILES);
    mrSettingsToRetain.add(FileInputFormat.INPUT_DIR_RECURSIVE);

    // FileOutputFormat
    mrSettingsToRetain.add(MRConfiguration.OUTPUT_BASENAME);
    mrSettingsToRetain.add(FileOutputFormat.COMPRESS);
    mrSettingsToRetain.add(FileOutputFormat.COMPRESS_CODEC);
    mrSettingsToRetain.add(FileOutputFormat.COMPRESS_TYPE);
    mrSettingsToRetain.add(FileOutputFormat.OUTDIR);
    mrSettingsToRetain.add(FileOutputCommitter.SUCCESSFUL_JOB_OUTPUT_DIR_MARKER);
}