Example usage for org.apache.hadoop.mapreduce.lib.input FileInputFormat LIST_STATUS_NUM_THREADS

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.lib.input FileInputFormat LIST_STATUS_NUM_THREADS.

Prototype

String LIST_STATUS_NUM_THREADS

To view the source code for org.apache.hadoop.mapreduce.lib.input FileInputFormat LIST_STATUS_NUM_THREADS, click the Source Link.
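
Before the full examples, here is a minimal, self-contained sketch of setting this key on a job configuration; the class name, input path, and thread count below are illustrative choices, not taken from the examples. LIST_STATUS_NUM_THREADS resolves to the property mapreduce.input.fileinputformat.list-status.num-threads, which controls how many threads FileInputFormat uses to list the input files when computing splits (the default is a single thread).

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class ListStatusThreadsSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "list-status-threads-sketch");

        // Use 8 listing threads (illustrative value); useful when the input
        // spans many files or directories and listing dominates split calculation.
        job.getConfiguration().set(FileInputFormat.LIST_STATUS_NUM_THREADS, "8");

        // Illustrative input path; the rest of the job setup (mapper, reducer,
        // output format, submission) is omitted.
        FileInputFormat.addInputPath(job, new Path("/data/input"));
    }
}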

Usage

From source file: com.streamsets.pipeline.stage.origin.hdfs.cluster.ClusterHdfsSource.java

License: Apache License

@Override
public List<ConfigIssue> init() {
    List<ConfigIssue> issues = super.init();
    validateHadoopFS(issues);
    // The number of splits determines the number of executors
    hadoopConf.set(FileInputFormat.LIST_STATUS_NUM_THREADS, "5"); // Per Hive-on-Spark
    hadoopConf.set(FileInputFormat.SPLIT_MAXSIZE, String.valueOf(750000000)); // Per Hive-on-Spark
    for (Map.Entry<String, String> config : hdfsConfigs.entrySet()) {
        hadoopConf.set(config.getKey(), config.getValue());
    }
    List<Path> hdfsDirPaths = new ArrayList<>();
    if (hdfsDirLocations == null || hdfsDirLocations.isEmpty()) {
        issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                Errors.HADOOPFS_18));
    } else if (issues.isEmpty()) {
        for (String hdfsDirLocation : hdfsDirLocations) {
            try {
                FileSystem fs = getFileSystemForInitDestroy();
                Path ph = fs.makeQualified(new Path(hdfsDirLocation));
                hdfsDirPaths.add(ph);
                if (!fs.exists(ph)) {
                    issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                            Errors.HADOOPFS_10, hdfsDirLocation));
                } else if (!fs.getFileStatus(ph).isDirectory()) {
                    issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                            Errors.HADOOPFS_15, hdfsDirLocation));
                } else {
                    try {
                        FileStatus[] files = fs.listStatus(ph);
                        if (files == null || files.length == 0) {
                            issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(),
                                    "hdfsDirLocations", Errors.HADOOPFS_16, hdfsDirLocation));
                        } else if (getContext().isPreview() && previewBuffer.size() < PREVIEW_SIZE) {
                            for (FileStatus fileStatus : files) {
                                if (fileStatus.isFile()) {
                                    String path = fileStatus.getPath().toString();
                                    try {
                                        List<Map.Entry> buffer;
                                        if (dataFormat == DataFormat.AVRO) {
                                            buffer = previewAvroBatch(fileStatus, PREVIEW_SIZE);
                                        } else {
                                            buffer = previewTextBatch(fileStatus, PREVIEW_SIZE);
                                        }
                                        for (int i = 0; i < buffer.size()
                                                && previewBuffer.size() < PREVIEW_SIZE; i++) {
                                            Map.Entry entry = buffer.get(i);
                                            previewBuffer.put(String.valueOf(entry.getKey()),
                                                    entry.getValue() == null ? null : entry.getValue());
                                        }
                                    } catch (IOException | InterruptedException ex) {
                                        String msg = "Error opening " + path + ": " + ex;
                                        LOG.info(msg, ex);
                                        issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(),
                                                "hdfsDirLocations", Errors.HADOOPFS_16, fileStatus.getPath()));
                                    }
                                }
                            }
                        }
                    } catch (IOException ex) {
                        issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                                Errors.HADOOPFS_09, hdfsDirLocation, ex.toString(), ex));
                    }
                }
            } catch (IOException ioe) {
                LOG.warn("Error connecting to HDFS filesystem: " + ioe, ioe);
                issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                        Errors.HADOOPFS_11, hdfsDirLocation, ioe.toString(), ioe));
            }
        }
    }
    hadoopConf.set(FileInputFormat.INPUT_DIR, StringUtils.join(hdfsDirPaths, ","));
    hadoopConf.set(FileInputFormat.INPUT_DIR_RECURSIVE, Boolean.toString(recursive));
    switch (dataFormat) {
    case JSON:
        if (jsonMaxObjectLen < 1) {
            issues.add(
                    getContext().createConfigIssue(Groups.JSON.name(), "jsonMaxObjectLen", Errors.HADOOPFS_04));
        }
        break;
    case TEXT:
        if (textMaxLineLen < 1) {
            issues.add(
                    getContext().createConfigIssue(Groups.TEXT.name(), "textMaxLineLen", Errors.HADOOPFS_05));
        }
        break;
    case LOG:
        logDataFormatValidator = new LogDataFormatValidator(logMode, logMaxObjectLen, retainOriginalLine,
                customLogFormat, regex, grokPatternDefinition, grokPattern, enableLog4jCustomLogFormat,
                log4jCustomLogFormat, OnParseError.ERROR, 0, Groups.LOG.name(),
                getFieldPathToGroupMap(fieldPathsToGroupName));
        logDataFormatValidator.validateLogFormatConfig(issues, getContext());
        break;
    case DELIMITED:
        if (csvMaxObjectLen < 1) {
            issues.add(getContext().createConfigIssue(Groups.DELIMITED.name(), "csvMaxObjectLen",
                    Errors.HADOOPFS_30));
        }
        break;
    case AVRO:
        if (avroSchema != null && !avroSchema.isEmpty()) {
            hadoopConf.set(AvroJob.INPUT_SCHEMA, avroSchema);
            hadoopConf.set(CONF_INPUT_KEY_SCHEMA, avroSchema);
        }
        break;
    default:
        issues.add(getContext().createConfigIssue(Groups.LOG.name(), "dataFormat", Errors.HADOOPFS_06,
                dataFormat));
    }
    validateParserFactoryConfigs(issues);
    LOG.info("Issues: " + issues);
    return issues;
}

From source file: com.uber.hoodie.utilities.HDFSParquetImporter.java

License: Apache License

protected JavaRDD<HoodieRecord<HoodieRecordPayload>> buildHoodieRecordsForImport(JavaSparkContext jsc,
        String schemaStr) throws IOException {
    Job job = Job.getInstance(jsc.hadoopConfiguration());
    // Allow recursive directories to be found
    job.getConfiguration().set(FileInputFormat.INPUT_DIR_RECURSIVE, "true");
    // To parallelize reading file status.
    job.getConfiguration().set(FileInputFormat.LIST_STATUS_NUM_THREADS, "1024");
    AvroReadSupport.setAvroReadSchema(jsc.hadoopConfiguration(), (new Schema.Parser().parse(schemaStr)));
    ParquetInputFormat.setReadSupportClass(job, (AvroReadSupport.class));

    return jsc
            .newAPIHadoopFile(cfg.srcPath, ParquetInputFormat.class, Void.class, GenericRecord.class,
                    job.getConfiguration())
            // To reduce the large number of tasks.
            .coalesce(16 * cfg.parallelism).map(entry -> {
                GenericRecord genericRecord = ((Tuple2<Void, GenericRecord>) entry)._2();
                Object partitionField = genericRecord.get(cfg.partitionKey);
                if (partitionField == null) {
                    throw new HoodieIOException("partition key is missing. :" + cfg.partitionKey);
                }
                Object rowField = genericRecord.get(cfg.rowKey);
                if (rowField == null) {
                    throw new HoodieIOException("row field is missing. :" + cfg.rowKey);
                }
                String partitionPath = partitionField.toString();
                logger.info("Row Key : " + rowField + ", Partition Path is (" + partitionPath + ")");
                if (partitionField instanceof Number) {
                    try {
                        long ts = (long) (Double.parseDouble(partitionField.toString()) * 1000L);
                        partitionPath = PARTITION_FORMATTER.format(new Date(ts));
                    } catch (NumberFormatException nfe) {
                        logger.warn("Unable to parse date from partition field. Assuming partition as ("
                                + partitionField + ")");
                    }
                }
                return new HoodieRecord<>(new HoodieKey((String) rowField, partitionPath),
                        new HoodieJsonPayload(genericRecord.toString()));
            });
}