Example usage for org.apache.hadoop.mapreduce.lib.input FileInputFormat SPLIT_MAXSIZE

Introduction

This page collects usage examples of org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MAXSIZE drawn from open-source projects.

Prototype

public static final String SPLIT_MAXSIZE = "mapreduce.input.fileinputformat.split.maxsize";

SPLIT_MAXSIZE holds the configuration key mapreduce.input.fileinputformat.split.maxsize, which puts an upper bound (in bytes) on the size of the input splits computed by FileInputFormat.
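
A minimal, self-contained sketch of how the key is typically set on a job (the class name, job name, input format, and the 128 MB figure are illustrative and not taken from the examples on this page):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class SplitMaxSizeExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Cap each input split at 128 MB; files larger than this are divided
        // across several map tasks (for input formats that honour the limit).
        conf.setLong(FileInputFormat.SPLIT_MAXSIZE, 128L * 1024 * 1024);

        Job job = Job.getInstance(conf, "split-maxsize-example");
        job.setInputFormatClass(TextInputFormat.class);

        // The typed helper writes the same property:
        FileInputFormat.setMaxInputSplitSize(job, 128L * 1024 * 1024);

        System.out.println(FileInputFormat.getMaxSplitSize(job)); // prints 134217728
    }
}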

Usage

From source file: com.msd.gin.halyard.tools.HalyardBulkLoad.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("Usage: bulkload [-D" + MRJobConfig.QUEUE_NAME + "=proofofconcepts] [-D"
                + SKIP_INVALID_PROPERTY + "=true] [-D" + SPLIT_BITS_PROPERTY + "=8] [-D"
                + DEFAULT_CONTEXT_PROPERTY + "=http://new_context] [-D" + OVERRIDE_CONTEXT_PROPERTY
                + "=true] <input_path(s)> <output_path> <table_name>");
        return -1;
    }
    TableMapReduceUtil.addDependencyJars(getConf(), NTriplesUtil.class, Rio.class, AbstractRDFHandler.class,
            RDFFormat.class, RDFParser.class);
    HBaseConfiguration.addHbaseResources(getConf());
    if (SnappyCodec.isNativeCodeLoaded()) {
        getConf().setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
        getConf().setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);
    }
    getConf().setDouble(MRJobConfig.COMPLETED_MAPS_FOR_REDUCE_SLOWSTART, 1.0);
    getConf().setLong(MRJobConfig.TASK_TIMEOUT, 3600000L);
    getConf().setInt(MRJobConfig.IO_SORT_FACTOR, 100);
    getConf().setInt(MRJobConfig.IO_SORT_MB, 1000);
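    // Limit each FileInputFormat split to roughly 1 GB (mapreduce.input.fileinputformat.split.maxsize).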
    getConf().setInt(FileInputFormat.SPLIT_MAXSIZE, 1000000000);
    getConf().setInt(LoadIncrementalHFiles.MAX_FILES_PER_REGION_PER_FAMILY, 2048);
    Job job = Job.getInstance(getConf(), "HalyardBulkLoad -> " + args[1] + " -> " + args[2]);
    job.setJarByClass(HalyardBulkLoad.class);
    job.setMapperClass(RDFMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setInputFormatClass(RioFileInputFormat.class);
    job.setSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);
    Map<String, Integer> contextSplitsMap = new HashMap<>();
    for (Map.Entry<String, String> me : getConf().getValByRegex(CONTEXT_SPLIT_REGEXP).entrySet()) {
        int splits = Integer.parseInt(me.getKey().substring(me.getKey().lastIndexOf('.') + 1));
        StringTokenizer stk = new StringTokenizer(me.getValue(), ",");
        while (stk.hasMoreTokens()) {
            contextSplitsMap.put(stk.nextToken(), splits);
        }
    }
    try (HTable hTable = HalyardTableUtils.getTable(getConf(), args[2], true,
            getConf().getInt(SPLIT_BITS_PROPERTY, 3), contextSplitsMap)) {
        HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(),
                hTable.getRegionLocator());
        FileInputFormat.setInputDirRecursive(job, true);
        FileInputFormat.setInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        TableMapReduceUtil.addDependencyJars(job);
        TableMapReduceUtil.initCredentials(job);
        if (job.waitForCompletion(true)) {
            new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(args[1]), hTable);
            LOG.info("Bulk Load Completed..");
            return 0;
        }
    }
    return -1;
}

From source file: com.msd.gin.halyard.tools.HalyardBulkUpdate.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("Usage: bulkupdate [-D" + MRJobConfig.QUEUE_NAME + "=proofofconcepts] [-D"
                + DEFAULT_CONTEXT_PROPERTY + "=http://new_context] [-D" + OVERRIDE_CONTEXT_PROPERTY
                + "=true] <input_file_with_SPARQL_queries> <output_path> <table_name>");
        return -1;
    }
    TableMapReduceUtil.addDependencyJars(getConf(), NTriplesUtil.class, Rio.class, RDFFormat.class,
            RDFParser.class);
    HBaseConfiguration.addHbaseResources(getConf());
    if (SnappyCodec.isNativeCodeLoaded()) {
        getConf().setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
        getConf().setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);
    }
    getConf().setDouble(MRJobConfig.COMPLETED_MAPS_FOR_REDUCE_SLOWSTART, 1.0);
    getConf().setLong(MRJobConfig.TASK_TIMEOUT, 3600000L);
    getConf().setInt(MRJobConfig.IO_SORT_FACTOR, 100);
    getConf().setInt(MRJobConfig.IO_SORT_MB, 1000);
    getConf().setInt(FileInputFormat.SPLIT_MAXSIZE, 1000000000);
    getConf().setInt(LoadIncrementalHFiles.MAX_FILES_PER_REGION_PER_FAMILY, 2048);
    getConf().setStrings(TABLE_NAME_PROPERTY, args[2]);
    Job job = Job.getInstance(getConf(), "HalyardBulkUpdate -> " + args[1] + " -> " + args[2]);
    NLineInputFormat.setNumLinesPerSplit(job, 1);
    job.setJarByClass(HalyardBulkUpdate.class);
    job.setMapperClass(SPARQLMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setInputFormatClass(NLineInputFormat.class);
    job.setSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);
    try (HTable hTable = HalyardTableUtils.getTable(getConf(), args[2], false, 0, null)) {
        HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(),
                hTable.getRegionLocator());
        FileInputFormat.setInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        TableMapReduceUtil.addDependencyJars(job);
        TableMapReduceUtil.initCredentials(job);
        if (job.waitForCompletion(true)) {
            new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(args[1]), hTable);
            LOG.info("Bulk Update Completed..");
            return 0;
        }
    }
    return -1;
}

From source file: com.msd.gin.halyard.tools.HalyardHiveLoad.java

License: Apache License

@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("Usage: hiveload -D" + RDF_MIME_TYPE_PROPERTY + "='application/ld+json' [-D"
                + MRJobConfig.QUEUE_NAME + "=proofofconcepts] [-D" + HIVE_DATA_COLUMN_INDEX_PROPERTY + "=3] [-D"
                + BASE_URI_PROPERTY + "='http://my_base_uri/'] [-D" + HalyardBulkLoad.SPLIT_BITS_PROPERTY
                + "=8] [-D" + HalyardBulkLoad.DEFAULT_CONTEXT_PROPERTY + "=http://new_context] [-D"
                + HalyardBulkLoad.OVERRIDE_CONTEXT_PROPERTY
                + "=true] <hive_table_name> <output_path> <hbase_table_name>");
        return -1;
    }
    TableMapReduceUtil.addDependencyJars(getConf(), NTriplesUtil.class, Rio.class, AbstractRDFHandler.class,
            RDFFormat.class, RDFParser.class);
    HBaseConfiguration.addHbaseResources(getConf());
    if (SnappyCodec.isNativeCodeLoaded()) {
        getConf().setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
        getConf().setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);
    }
    getConf().setDouble(MRJobConfig.COMPLETED_MAPS_FOR_REDUCE_SLOWSTART, 1.0);
    getConf().setLong(MRJobConfig.TASK_TIMEOUT, 3600000L);
    getConf().setInt(MRJobConfig.IO_SORT_FACTOR, 100);
    getConf().setInt(MRJobConfig.IO_SORT_MB, 1000);
    getConf().setInt(FileInputFormat.SPLIT_MAXSIZE, 1000000000);
    getConf().setInt(LoadIncrementalHFiles.MAX_FILES_PER_REGION_PER_FAMILY, 2048);
    Job job = Job.getInstance(getConf(), "HalyardHiveLoad -> " + args[1] + " -> " + args[2]);
    int i = args[0].indexOf('.');
    HCatInputFormat.setInput(job, i > 0 ? args[0].substring(0, i) : null, args[0].substring(i + 1));
    job.setJarByClass(HalyardHiveLoad.class);
    job.setMapperClass(HiveMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setInputFormatClass(HCatInputFormat.class);
    job.setSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);
    Map<String, Integer> contextSplitsMap = new HashMap<>();
    for (Map.Entry<String, String> me : getConf().getValByRegex(HalyardBulkLoad.CONTEXT_SPLIT_REGEXP)
            .entrySet()) {
        int splits = Integer.parseInt(me.getKey().substring(me.getKey().lastIndexOf('.') + 1));
        StringTokenizer stk = new StringTokenizer(me.getValue(), ",");
        while (stk.hasMoreTokens()) {
            contextSplitsMap.put(stk.nextToken(), splits);
        }
    }
    try (HTable hTable = HalyardTableUtils.getTable(getConf(), args[2], true,
            getConf().getInt(HalyardBulkLoad.SPLIT_BITS_PROPERTY, 3), contextSplitsMap)) {
        HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(),
                hTable.getRegionLocator());
        FileInputFormat.setInputDirRecursive(job, true);
        FileInputFormat.setInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        TableMapReduceUtil.addDependencyJars(job);
        TableMapReduceUtil.initCredentials(job);
        if (job.waitForCompletion(true)) {
            new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(args[1]), hTable);
            LOG.info("Bulk Load Completed..");
            return 0;
        }
    }
    return -1;
}

From source file: com.streamsets.pipeline.stage.origin.hdfs.cluster.ClusterHdfsSource.java

License: Apache License

@Override
public List<ConfigIssue> init() {
    List<ConfigIssue> issues = super.init();
    validateHadoopFS(issues);
    // This determines the number of splits, and therefore the number of executors
    hadoopConf.set(FileInputFormat.LIST_STATUS_NUM_THREADS, "5"); // Per Hive-on-Spark
    hadoopConf.set(FileInputFormat.SPLIT_MAXSIZE, String.valueOf(750000000)); // Per Hive-on-Spark
    for (Map.Entry<String, String> config : hdfsConfigs.entrySet()) {
        hadoopConf.set(config.getKey(), config.getValue());
    }
    List<Path> hdfsDirPaths = new ArrayList<>();
    if (hdfsDirLocations == null || hdfsDirLocations.isEmpty()) {
        issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                Errors.HADOOPFS_18));
    } else if (issues.isEmpty()) {
        for (String hdfsDirLocation : hdfsDirLocations) {
            try {
                FileSystem fs = getFileSystemForInitDestroy();
                Path ph = fs.makeQualified(new Path(hdfsDirLocation));
                hdfsDirPaths.add(ph);
                if (!fs.exists(ph)) {
                    issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                            Errors.HADOOPFS_10, hdfsDirLocation));
                } else if (!fs.getFileStatus(ph).isDirectory()) {
                    issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                            Errors.HADOOPFS_15, hdfsDirLocation));
                } else {
                    try {
                        FileStatus[] files = fs.listStatus(ph);
                        if (files == null || files.length == 0) {
                            issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(),
                                    "hdfsDirLocations", Errors.HADOOPFS_16, hdfsDirLocation));
                        } else if (getContext().isPreview() && previewBuffer.size() < PREVIEW_SIZE) {
                            for (FileStatus fileStatus : files) {
                                if (fileStatus.isFile()) {
                                    String path = fileStatus.getPath().toString();
                                    try {
                                        List<Map.Entry> buffer;
                                        if (dataFormat == DataFormat.AVRO) {
                                            buffer = previewAvroBatch(fileStatus, PREVIEW_SIZE);
                                        } else {
                                            buffer = previewTextBatch(fileStatus, PREVIEW_SIZE);
                                        }
                                        for (int i = 0; i < buffer.size()
                                                && previewBuffer.size() < PREVIEW_SIZE; i++) {
                                            Map.Entry entry = buffer.get(i);
                                            previewBuffer.put(String.valueOf(entry.getKey()),
                                                    entry.getValue() == null ? null : entry.getValue());
                                        }
                                    } catch (IOException | InterruptedException ex) {
                                        String msg = "Error opening " + path + ": " + ex;
                                        LOG.info(msg, ex);
                                        issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(),
                                                "hdfsDirLocations", Errors.HADOOPFS_16, fileStatus.getPath()));
                                    }
                                }
                            }
                        }
                    } catch (IOException ex) {
                        issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                                Errors.HADOOPFS_09, hdfsDirLocation, ex.toString(), ex));
                    }
                }
            } catch (IOException ioe) {
                LOG.warn("Error connecting to HDFS filesystem: " + ioe, ioe);
                issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                        Errors.HADOOPFS_11, hdfsDirLocation, ioe.toString(), ioe));
            }
        }
    }
    hadoopConf.set(FileInputFormat.INPUT_DIR, StringUtils.join(hdfsDirPaths, ","));
    hadoopConf.set(FileInputFormat.INPUT_DIR_RECURSIVE, Boolean.toString(recursive));
    switch (dataFormat) {
    case JSON:
        if (jsonMaxObjectLen < 1) {
            issues.add(
                    getContext().createConfigIssue(Groups.JSON.name(), "jsonMaxObjectLen", Errors.HADOOPFS_04));
        }
        break;
    case TEXT:
        if (textMaxLineLen < 1) {
            issues.add(
                    getContext().createConfigIssue(Groups.TEXT.name(), "textMaxLineLen", Errors.HADOOPFS_05));
        }
        break;
    case LOG:
        logDataFormatValidator = new LogDataFormatValidator(logMode, logMaxObjectLen, retainOriginalLine,
                customLogFormat, regex, grokPatternDefinition, grokPattern, enableLog4jCustomLogFormat,
                log4jCustomLogFormat, OnParseError.ERROR, 0, Groups.LOG.name(),
                getFieldPathToGroupMap(fieldPathsToGroupName));
        logDataFormatValidator.validateLogFormatConfig(issues, getContext());
        break;
    case DELIMITED:
        if (csvMaxObjectLen < 1) {
            issues.add(getContext().createConfigIssue(Groups.DELIMITED.name(), "csvMaxObjectLen",
                    Errors.HADOOPFS_30));
        }
        break;
    case AVRO:
        if (avroSchema != null && !avroSchema.isEmpty()) {
            hadoopConf.set(AvroJob.INPUT_SCHEMA, avroSchema);
            hadoopConf.set(CONF_INPUT_KEY_SCHEMA, avroSchema);
        }
        break;
    default:
        issues.add(getContext().createConfigIssue(Groups.LOG.name(), "dataFormat", Errors.HADOOPFS_06,
                dataFormat));
    }
    validateParserFactoryConfigs(issues);
    LOG.info("Issues: " + issues);
    return issues;
}

From source file: org.apache.ignite.internal.processors.hadoop.GridHadoopMapReduceEmbeddedSelfTest.java

License: Apache License

/**
 * Tests whole job execution with all phases in old and new versions of API with definition of custom
 * Serialization, Partitioner and IO formats.
 * @throws Exception If fails.
 */
public void testMultiReducerWholeMapReduceExecution() throws Exception {
    IgfsPath inDir = new IgfsPath(PATH_INPUT);

    igfs.mkdirs(inDir);

    IgfsPath inFile = new IgfsPath(inDir, GridHadoopWordCount2.class.getSimpleName() + "-input");

    generateTestFile(inFile.toString(), "key1", 10000, "key2", 20000, "key3", 15000, "key4", 7000, "key5",
            12000, "key6", 18000);

    for (int i = 0; i < 2; i++) {
        boolean useNewAPI = i == 1;

        igfs.delete(new IgfsPath(PATH_OUTPUT), true);

        flags.put("serializationWasConfigured", false);
        flags.put("partitionerWasConfigured", false);
        flags.put("inputFormatWasConfigured", false);
        flags.put("outputFormatWasConfigured", false);

        JobConf jobConf = new JobConf();

        jobConf.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

        //To split into about 6-7 items for v2
        jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);

        //For v1
        jobConf.setInt("fs.local.block.size", 65000);

        // File system coordinates.
        setupFileSystems(jobConf);

        GridHadoopWordCount1.setTasksClasses(jobConf, !useNewAPI, !useNewAPI, !useNewAPI);

        if (!useNewAPI) {
            jobConf.setPartitionerClass(CustomV1Partitioner.class);
            jobConf.setInputFormat(CustomV1InputFormat.class);
            jobConf.setOutputFormat(CustomV1OutputFormat.class);
        }

        Job job = Job.getInstance(jobConf);

        GridHadoopWordCount2.setTasksClasses(job, useNewAPI, useNewAPI, useNewAPI);

        if (useNewAPI) {
            job.setPartitionerClass(CustomV2Partitioner.class);
            job.setInputFormatClass(CustomV2InputFormat.class);
            job.setOutputFormatClass(CustomV2OutputFormat.class);
        }

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(igfsScheme() + inFile.toString()));
        FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_OUTPUT));

        job.setNumReduceTasks(3);

        job.setJarByClass(GridHadoopWordCount2.class);

        IgniteInternalFuture<?> fut = grid(0).hadoop().submit(new GridHadoopJobId(UUID.randomUUID(), 1),
                createJobInfo(job.getConfiguration()));

        fut.get();

        assertTrue("Serialization was configured (new API is " + useNewAPI + ")",
                flags.get("serializationWasConfigured"));

        assertTrue("Partitioner was configured (new API is = " + useNewAPI + ")",
                flags.get("partitionerWasConfigured"));

        assertTrue("Input format was configured (new API is = " + useNewAPI + ")",
                flags.get("inputFormatWasConfigured"));

        assertTrue("Output format was configured (new API is = " + useNewAPI + ")",
                flags.get("outputFormatWasConfigured"));

        assertEquals("Use new API = " + useNewAPI, "key3\t15000\n" + "key6\t18000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00000"));

        assertEquals("Use new API = " + useNewAPI, "key1\t10000\n" + "key4\t7000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00001"));

        assertEquals("Use new API = " + useNewAPI, "key2\t20000\n" + "key5\t12000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00002"));

    }
}

From source file: org.apache.ignite.internal.processors.hadoop.GridHadoopMapReduceTest.java

License: Apache License

/**
 * Tests whole job execution with all phases in all combination of new and old versions of API.
 * @throws Exception If fails.
 */
public void testWholeMapReduceExecution() throws Exception {
    IgfsPath inDir = new IgfsPath(PATH_INPUT);

    igfs.mkdirs(inDir);

    IgfsPath inFile = new IgfsPath(inDir, GridHadoopWordCount2.class.getSimpleName() + "-input");

    generateTestFile(inFile.toString(), "red", 100000, "blue", 200000, "green", 150000, "yellow", 70000);

    for (int i = 0; i < 8; i++) {
        igfs.delete(new IgfsPath(PATH_OUTPUT), true);

        boolean useNewMapper = (i & 1) == 0;
        boolean useNewCombiner = (i & 2) == 0;
        boolean useNewReducer = (i & 4) == 0;

        JobConf jobConf = new JobConf();

        jobConf.set(JOB_COUNTER_WRITER_PROPERTY, GridHadoopFSCounterWriter.class.getName());
        jobConf.setUser("yyy");
        jobConf.set(GridHadoopFSCounterWriter.COUNTER_WRITER_DIR_PROPERTY, "/xxx/${USER}/zzz");

        //To split into about 40 items for v2
        jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);

        //For v1
        jobConf.setInt("fs.local.block.size", 65000);

        // File system coordinates.
        setupFileSystems(jobConf);

        GridHadoopWordCount1.setTasksClasses(jobConf, !useNewMapper, !useNewCombiner, !useNewReducer);

        Job job = Job.getInstance(jobConf);

        GridHadoopWordCount2.setTasksClasses(job, useNewMapper, useNewCombiner, useNewReducer);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(igfsScheme() + inFile.toString()));
        FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_OUTPUT));

        job.setJarByClass(GridHadoopWordCount2.class);

        GridHadoopJobId jobId = new GridHadoopJobId(UUID.randomUUID(), 1);

        IgniteInternalFuture<?> fut = grid(0).hadoop().submit(jobId, createJobInfo(job.getConfiguration()));

        fut.get();

        checkJobStatistics(jobId);

        assertEquals(
                "Use new mapper: " + useNewMapper + ", new combiner: " + useNewCombiner + ", new reducer: "
                        + useNewReducer,
                "blue\t200000\n" + "green\t150000\n" + "red\t100000\n" + "yellow\t70000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewReducer ? "part-r-" : "part-") + "00000"));
    }
}

From source file: org.apache.ignite.internal.processors.hadoop.HadoopAbstractMapReduceTest.java

License: Apache License

/**
 * Does actual test job.
 *
 * @param useNewMapper flag to use new mapper API.
 * @param useNewCombiner flag to use new combiner API.
 * @param useNewReducer flag to use new reducer API.
 */
protected final void doTest(IgfsPath inFile, boolean useNewMapper, boolean useNewCombiner,
        boolean useNewReducer) throws Exception {
    igfs.delete(new IgfsPath(PATH_OUTPUT), true);

    JobConf jobConf = new JobConf();

    jobConf.set(JOB_COUNTER_WRITER_PROPERTY, IgniteHadoopFileSystemCounterWriter.class.getName());
    jobConf.setUser(USER);
    jobConf.set(IgniteHadoopFileSystemCounterWriter.COUNTER_WRITER_DIR_PROPERTY, "/xxx/${USER}/zzz");

    //To split into about 40 items for v2
    jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);

    //For v1
    jobConf.setInt("fs.local.block.size", 65000);

    // File system coordinates.
    setupFileSystems(jobConf);

    HadoopWordCount1.setTasksClasses(jobConf, !useNewMapper, !useNewCombiner, !useNewReducer);

    Job job = Job.getInstance(jobConf);

    HadoopWordCount2.setTasksClasses(job, useNewMapper, useNewCombiner, useNewReducer, compressOutputSnappy());

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.setInputPaths(job, new Path(igfsScheme() + inFile.toString()));
    FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_OUTPUT));

    job.setJarByClass(HadoopWordCount2.class);

    HadoopJobId jobId = new HadoopJobId(UUID.randomUUID(), 1);

    IgniteInternalFuture<?> fut = grid(0).hadoop().submit(jobId, createJobInfo(job.getConfiguration()));

    fut.get();

    checkJobStatistics(jobId);

    final String outFile = PATH_OUTPUT + "/" + (useNewReducer ? "part-r-" : "part-") + "00000";

    checkOwner(new IgfsPath(PATH_OUTPUT + "/" + "_SUCCESS"));

    checkOwner(new IgfsPath(outFile));

    String actual = readAndSortFile(outFile, job.getConfiguration());

    assertEquals(
            "Use new mapper: " + useNewMapper + ", new combiner: " + useNewCombiner + ", new reducer: "
                    + useNewReducer,
            "blue\t" + blue + "\n" + "green\t" + green + "\n" + "red\t" + red + "\n" + "yellow\t" + yellow
                    + "\n",
            actual);
}

From source file: org.apache.ignite.internal.processors.hadoop.HadoopMapReduceEmbeddedSelfTest.java

License: Apache License

/**
 * Tests whole job execution with all phases in old and new versions of API with definition of custom
 * Serialization, Partitioner and IO formats.
 * @throws Exception If fails.
 */
public void testMultiReducerWholeMapReduceExecution() throws Exception {
    IgfsPath inDir = new IgfsPath(PATH_INPUT);

    igfs.mkdirs(inDir);

    IgfsPath inFile = new IgfsPath(inDir, HadoopWordCount2.class.getSimpleName() + "-input");

    generateTestFile(inFile.toString(), "key1", 10000, "key2", 20000, "key3", 15000, "key4", 7000, "key5",
            12000, "key6", 18000);

    for (int i = 0; i < 2; i++) {
        boolean useNewAPI = i == 1;

        igfs.delete(new IgfsPath(PATH_OUTPUT), true);

        flags.put("serializationWasConfigured", false);
        flags.put("partitionerWasConfigured", false);
        flags.put("inputFormatWasConfigured", false);
        flags.put("outputFormatWasConfigured", false);

        JobConf jobConf = new JobConf();

        jobConf.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());

        //To split into about 6-7 items for v2
        jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);

        //For v1
        jobConf.setInt("fs.local.block.size", 65000);

        // File system coordinates.
        setupFileSystems(jobConf);

        HadoopWordCount1.setTasksClasses(jobConf, !useNewAPI, !useNewAPI, !useNewAPI);

        if (!useNewAPI) {
            jobConf.setPartitionerClass(CustomV1Partitioner.class);
            jobConf.setInputFormat(CustomV1InputFormat.class);
            jobConf.setOutputFormat(CustomV1OutputFormat.class);
        }

        Job job = Job.getInstance(jobConf);

        HadoopWordCount2.setTasksClasses(job, useNewAPI, useNewAPI, useNewAPI);

        if (useNewAPI) {
            job.setPartitionerClass(CustomV2Partitioner.class);
            job.setInputFormatClass(CustomV2InputFormat.class);
            job.setOutputFormatClass(CustomV2OutputFormat.class);
        }

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(igfsScheme() + inFile.toString()));
        FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_OUTPUT));

        job.setNumReduceTasks(3);

        job.setJarByClass(HadoopWordCount2.class);

        IgniteInternalFuture<?> fut = grid(0).hadoop().submit(new HadoopJobId(UUID.randomUUID(), 1),
                createJobInfo(job.getConfiguration()));

        fut.get();

        assertTrue("Serialization was configured (new API is " + useNewAPI + ")",
                flags.get("serializationWasConfigured"));

        assertTrue("Partitioner was configured (new API is = " + useNewAPI + ")",
                flags.get("partitionerWasConfigured"));

        assertTrue("Input format was configured (new API is = " + useNewAPI + ")",
                flags.get("inputFormatWasConfigured"));

        assertTrue("Output format was configured (new API is = " + useNewAPI + ")",
                flags.get("outputFormatWasConfigured"));

        assertEquals("Use new API = " + useNewAPI, "key3\t15000\n" + "key6\t18000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00000"));

        assertEquals("Use new API = " + useNewAPI, "key1\t10000\n" + "key4\t7000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00001"));

        assertEquals("Use new API = " + useNewAPI, "key2\t20000\n" + "key5\t12000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00002"));

    }
}

From source file: org.apache.ignite.internal.processors.hadoop.HadoopMapReduceTest.java

License: Apache License

/**
 * Tests whole job execution with all phases in all combination of new and old versions of API.
 * @throws Exception If fails.
 */
public void testWholeMapReduceExecution() throws Exception {
    IgfsPath inDir = new IgfsPath(PATH_INPUT);

    igfs.mkdirs(inDir);

    IgfsPath inFile = new IgfsPath(inDir, HadoopWordCount2.class.getSimpleName() + "-input");

    generateTestFile(inFile.toString(), "red", 100000, "blue", 200000, "green", 150000, "yellow", 70000);

    for (int i = 0; i < 8; i++) {
        igfs.delete(new IgfsPath(PATH_OUTPUT), true);

        boolean useNewMapper = (i & 1) == 0;
        boolean useNewCombiner = (i & 2) == 0;
        boolean useNewReducer = (i & 4) == 0;

        JobConf jobConf = new JobConf();

        jobConf.set(JOB_COUNTER_WRITER_PROPERTY, IgniteHadoopFileSystemCounterWriter.class.getName());
        jobConf.setUser("yyy");
        jobConf.set(IgniteHadoopFileSystemCounterWriter.COUNTER_WRITER_DIR_PROPERTY, "/xxx/${USER}/zzz");

        //To split into about 40 items for v2
        jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);

        //For v1
        jobConf.setInt("fs.local.block.size", 65000);

        // File system coordinates.
        setupFileSystems(jobConf);

        HadoopWordCount1.setTasksClasses(jobConf, !useNewMapper, !useNewCombiner, !useNewReducer);

        Job job = Job.getInstance(jobConf);

        HadoopWordCount2.setTasksClasses(job, useNewMapper, useNewCombiner, useNewReducer);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(igfsScheme() + inFile.toString()));
        FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_OUTPUT));

        job.setJarByClass(HadoopWordCount2.class);

        HadoopJobId jobId = new HadoopJobId(UUID.randomUUID(), 1);

        IgniteInternalFuture<?> fut = grid(0).hadoop().submit(jobId, createJobInfo(job.getConfiguration()));

        fut.get();

        checkJobStatistics(jobId);

        assertEquals(
                "Use new mapper: " + useNewMapper + ", new combiner: " + useNewCombiner + ", new reducer: "
                        + useNewReducer,
                "blue\t200000\n" + "green\t150000\n" + "red\t100000\n" + "yellow\t70000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewReducer ? "part-r-" : "part-") + "00000"));
    }
}

From source file: org.apache.ignite.internal.processors.hadoop.impl.HadoopAbstractMapReduceTest.java

License: Apache License

/**
 * Does actual test job.
 *
 * @param useNewMapper flag to use new mapper API.
 * @param useNewCombiner flag to use new combiner API.
 * @param useNewReducer flag to use new reducer API.
 */
protected final void doTest(IgfsPath inFile, boolean useNewMapper, boolean useNewCombiner,
        boolean useNewReducer) throws Exception {
    log.info("useNewMapper=" + useNewMapper + ", useNewCombiner=" + useNewCombiner + ", useNewReducer="
            + useNewReducer);

    igfs.delete(new IgfsPath(PATH_OUTPUT), true);

    JobConf jobConf = new JobConf();

    jobConf.set(HadoopCommonUtils.JOB_COUNTER_WRITER_PROPERTY,
            IgniteHadoopFileSystemCounterWriter.class.getName());
    jobConf.setUser(USER);
    jobConf.set(IgniteHadoopFileSystemCounterWriter.COUNTER_WRITER_DIR_PROPERTY, "/xxx/${USER}/zzz");

    //To split into about 40 items for v2
    jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);

    //For v1
    jobConf.setInt("fs.local.block.size", 65000);

    // File system coordinates.
    setupFileSystems(jobConf);

    HadoopWordCount1.setTasksClasses(jobConf, !useNewMapper, !useNewCombiner, !useNewReducer);

    Job job = Job.getInstance(jobConf);

    HadoopWordCount2.setTasksClasses(job, useNewMapper, useNewCombiner, useNewReducer, compressOutputSnappy());

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    FileInputFormat.setInputPaths(job, new Path(igfsScheme() + inFile.toString()));
    FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_OUTPUT));

    job.setJarByClass(HadoopWordCount2.class);

    HadoopJobId jobId = new HadoopJobId(UUID.randomUUID(), 1);

    IgniteInternalFuture<?> fut = grid(0).hadoop().submit(jobId, createJobInfo(job.getConfiguration()));

    fut.get();

    checkJobStatistics(jobId);

    final String outFile = PATH_OUTPUT + "/" + (useNewReducer ? "part-r-" : "part-") + "00000";

    checkOwner(new IgfsPath(PATH_OUTPUT + "/" + "_SUCCESS"));

    checkOwner(new IgfsPath(outFile));

    String actual = readAndSortFile(outFile, job.getConfiguration());

    assertEquals(
            "Use new mapper: " + useNewMapper + ", new combiner: " + useNewCombiner + ", new reducer: "
                    + useNewReducer,
            "blue\t" + blue + "\n" + "green\t" + green + "\n" + "red\t" + red + "\n" + "yellow\t" + yellow
                    + "\n",
            actual);
}