List of usage examples for org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MAXSIZE
Field: String SPLIT_MAXSIZE, the configuration key "mapreduce.input.fileinputformat.split.maxsize". It caps the maximum size, in bytes, of a single input split.
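Before the full examples below, a minimal sketch of how this key is typically set. The 256 MB cap, class name, job name and input path are illustrative and not taken from any of the projects below; FileInputFormat.setMaxInputSplitSize writes the same property.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class SplitMaxSizeExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Cap each input split at 256 MB by writing the key directly.
        conf.setLong(FileInputFormat.SPLIT_MAXSIZE, 256L * 1024 * 1024);

        Job job = Job.getInstance(conf, "split-maxsize-example");
        job.setInputFormatClass(TextInputFormat.class);
        // Equivalent typed helper: sets the same property on the job configuration.
        FileInputFormat.setMaxInputSplitSize(job, 256L * 1024 * 1024);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // ... configure mapper/reducer classes and an output path before submitting.
    }
}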
From source file:com.msd.gin.halyard.tools.HalyardBulkLoad.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("Usage: bulkload [-D" + MRJobConfig.QUEUE_NAME + "=proofofconcepts] [-D"
                + SKIP_INVALID_PROPERTY + "=true] [-D" + SPLIT_BITS_PROPERTY + "=8] [-D"
                + DEFAULT_CONTEXT_PROPERTY + "=http://new_context] [-D" + OVERRIDE_CONTEXT_PROPERTY
                + "=true] <input_path(s)> <output_path> <table_name>");
        return -1;
    }
    TableMapReduceUtil.addDependencyJars(getConf(), NTriplesUtil.class, Rio.class, AbstractRDFHandler.class,
            RDFFormat.class, RDFParser.class);
    HBaseConfiguration.addHbaseResources(getConf());
    if (SnappyCodec.isNativeCodeLoaded()) {
        getConf().setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
        getConf().setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);
    }
    getConf().setDouble(MRJobConfig.COMPLETED_MAPS_FOR_REDUCE_SLOWSTART, 1.0);
    getConf().setLong(MRJobConfig.TASK_TIMEOUT, 3600000l);
    getConf().setInt(MRJobConfig.IO_SORT_FACTOR, 100);
    getConf().setInt(MRJobConfig.IO_SORT_MB, 1000);
    getConf().setInt(FileInputFormat.SPLIT_MAXSIZE, 1000000000);
    getConf().setInt(LoadIncrementalHFiles.MAX_FILES_PER_REGION_PER_FAMILY, 2048);
    Job job = Job.getInstance(getConf(), "HalyardBulkLoad -> " + args[1] + " -> " + args[2]);
    job.setJarByClass(HalyardBulkLoad.class);
    job.setMapperClass(RDFMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setInputFormatClass(RioFileInputFormat.class);
    job.setSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);
    Map<String, Integer> contextSplitsMap = new HashMap<>();
    for (Map.Entry<String, String> me : getConf().getValByRegex(CONTEXT_SPLIT_REGEXP).entrySet()) {
        int splits = Integer.parseInt(me.getKey().substring(me.getKey().lastIndexOf('.') + 1));
        StringTokenizer stk = new StringTokenizer(me.getValue(), ",");
        while (stk.hasMoreTokens()) {
            contextSplitsMap.put(stk.nextToken(), splits);
        }
    }
    try (HTable hTable = HalyardTableUtils.getTable(getConf(), args[2], true,
            getConf().getInt(SPLIT_BITS_PROPERTY, 3), contextSplitsMap)) {
        HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(), hTable.getRegionLocator());
        FileInputFormat.setInputDirRecursive(job, true);
        FileInputFormat.setInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        TableMapReduceUtil.addDependencyJars(job);
        TableMapReduceUtil.initCredentials(job);
        if (job.waitForCompletion(true)) {
            new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(args[1]), hTable);
            LOG.info("Bulk Load Completed..");
            return 0;
        }
    }
    return -1;
}
From source file:com.msd.gin.halyard.tools.HalyardBulkUpdate.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("Usage: bulkupdate [-D" + MRJobConfig.QUEUE_NAME + "=proofofconcepts] [-D"
                + DEFAULT_CONTEXT_PROPERTY + "=http://new_context] [-D" + OVERRIDE_CONTEXT_PROPERTY
                + "=true] <input_file_with_SPARQL_queries> <output_path> <table_name>");
        return -1;
    }
    TableMapReduceUtil.addDependencyJars(getConf(), NTriplesUtil.class, Rio.class, RDFFormat.class,
            RDFParser.class);
    HBaseConfiguration.addHbaseResources(getConf());
    if (SnappyCodec.isNativeCodeLoaded()) {
        getConf().setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
        getConf().setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);
    }
    getConf().setDouble(MRJobConfig.COMPLETED_MAPS_FOR_REDUCE_SLOWSTART, 1.0);
    getConf().setLong(MRJobConfig.TASK_TIMEOUT, 3600000l);
    getConf().setInt(MRJobConfig.IO_SORT_FACTOR, 100);
    getConf().setInt(MRJobConfig.IO_SORT_MB, 1000);
    getConf().setInt(FileInputFormat.SPLIT_MAXSIZE, 1000000000);
    getConf().setInt(LoadIncrementalHFiles.MAX_FILES_PER_REGION_PER_FAMILY, 2048);
    getConf().setStrings(TABLE_NAME_PROPERTY, args[2]);
    Job job = Job.getInstance(getConf(), "HalyardBulkUpdate -> " + args[1] + " -> " + args[2]);
    NLineInputFormat.setNumLinesPerSplit(job, 1);
    job.setJarByClass(HalyardBulkUpdate.class);
    job.setMapperClass(SPARQLMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setInputFormatClass(NLineInputFormat.class);
    job.setSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);
    try (HTable hTable = HalyardTableUtils.getTable(getConf(), args[2], false, 0, null)) {
        HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(), hTable.getRegionLocator());
        FileInputFormat.setInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        TableMapReduceUtil.addDependencyJars(job);
        TableMapReduceUtil.initCredentials(job);
        if (job.waitForCompletion(true)) {
            new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(args[1]), hTable);
            LOG.info("Bulk Update Completed..");
            return 0;
        }
    }
    return -1;
}
From source file:com.msd.gin.halyard.tools.HalyardHiveLoad.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    if (args.length != 3) {
        System.err.println("Usage: hiveload -D" + RDF_MIME_TYPE_PROPERTY + "='application/ld+json' [-D"
                + MRJobConfig.QUEUE_NAME + "=proofofconcepts] [-D" + HIVE_DATA_COLUMN_INDEX_PROPERTY
                + "=3] [-D" + BASE_URI_PROPERTY + "='http://my_base_uri/'] [-D"
                + HalyardBulkLoad.SPLIT_BITS_PROPERTY + "=8] [-D" + HalyardBulkLoad.DEFAULT_CONTEXT_PROPERTY
                + "=http://new_context] [-D" + HalyardBulkLoad.OVERRIDE_CONTEXT_PROPERTY
                + "=true] <hive_table_name> <output_path> <hbase_table_name>");
        return -1;
    }
    TableMapReduceUtil.addDependencyJars(getConf(), NTriplesUtil.class, Rio.class, AbstractRDFHandler.class,
            RDFFormat.class, RDFParser.class);
    HBaseConfiguration.addHbaseResources(getConf());
    if (SnappyCodec.isNativeCodeLoaded()) {
        getConf().setBoolean(MRJobConfig.MAP_OUTPUT_COMPRESS, true);
        getConf().setClass(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, SnappyCodec.class, CompressionCodec.class);
    }
    getConf().setDouble(MRJobConfig.COMPLETED_MAPS_FOR_REDUCE_SLOWSTART, 1.0);
    getConf().setLong(MRJobConfig.TASK_TIMEOUT, 3600000l);
    getConf().setInt(MRJobConfig.IO_SORT_FACTOR, 100);
    getConf().setInt(MRJobConfig.IO_SORT_MB, 1000);
    getConf().setInt(FileInputFormat.SPLIT_MAXSIZE, 1000000000);
    getConf().setInt(LoadIncrementalHFiles.MAX_FILES_PER_REGION_PER_FAMILY, 2048);
    Job job = Job.getInstance(getConf(), "HalyardHiveLoad -> " + args[1] + " -> " + args[2]);
    int i = args[0].indexOf('.');
    HCatInputFormat.setInput(job, i > 0 ? args[0].substring(0, i) : null, args[0].substring(i + 1));
    job.setJarByClass(HalyardHiveLoad.class);
    job.setMapperClass(HiveMapper.class);
    job.setMapOutputKeyClass(ImmutableBytesWritable.class);
    job.setMapOutputValueClass(KeyValue.class);
    job.setInputFormatClass(HCatInputFormat.class);
    job.setSpeculativeExecution(false);
    job.setReduceSpeculativeExecution(false);
    Map<String, Integer> contextSplitsMap = new HashMap<>();
    for (Map.Entry<String, String> me : getConf().getValByRegex(HalyardBulkLoad.CONTEXT_SPLIT_REGEXP).entrySet()) {
        int splits = Integer.parseInt(me.getKey().substring(me.getKey().lastIndexOf('.') + 1));
        StringTokenizer stk = new StringTokenizer(me.getValue(), ",");
        while (stk.hasMoreTokens()) {
            contextSplitsMap.put(stk.nextToken(), splits);
        }
    }
    try (HTable hTable = HalyardTableUtils.getTable(getConf(), args[2], true,
            getConf().getInt(HalyardBulkLoad.SPLIT_BITS_PROPERTY, 3), contextSplitsMap)) {
        HFileOutputFormat2.configureIncrementalLoad(job, hTable.getTableDescriptor(), hTable.getRegionLocator());
        FileInputFormat.setInputDirRecursive(job, true);
        FileInputFormat.setInputPaths(job, args[0]);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        TableMapReduceUtil.addDependencyJars(job);
        TableMapReduceUtil.initCredentials(job);
        if (job.waitForCompletion(true)) {
            new LoadIncrementalHFiles(getConf()).doBulkLoad(new Path(args[1]), hTable);
            LOG.info("Bulk Load Completed..");
            return 0;
        }
    }
    return -1;
}
From source file:com.streamsets.pipeline.stage.origin.hdfs.cluster.ClusterHdfsSource.java
License:Apache License
@Override
public List<ConfigIssue> init() {
    List<ConfigIssue> issues = super.init();
    validateHadoopFS(issues);
    // This is for getting no of splits - no of executors
    hadoopConf.set(FileInputFormat.LIST_STATUS_NUM_THREADS, "5"); // Per Hive-on-Spark
    hadoopConf.set(FileInputFormat.SPLIT_MAXSIZE, String.valueOf(750000000)); // Per Hive-on-Spark
    for (Map.Entry<String, String> config : hdfsConfigs.entrySet()) {
        hadoopConf.set(config.getKey(), config.getValue());
    }
    List<Path> hdfsDirPaths = new ArrayList<>();
    if (hdfsDirLocations == null || hdfsDirLocations.isEmpty()) {
        issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations", Errors.HADOOPFS_18));
    } else if (issues.isEmpty()) {
        for (String hdfsDirLocation : hdfsDirLocations) {
            try {
                FileSystem fs = getFileSystemForInitDestroy();
                Path ph = fs.makeQualified(new Path(hdfsDirLocation));
                hdfsDirPaths.add(ph);
                if (!fs.exists(ph)) {
                    issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                            Errors.HADOOPFS_10, hdfsDirLocation));
                } else if (!fs.getFileStatus(ph).isDirectory()) {
                    issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                            Errors.HADOOPFS_15, hdfsDirLocation));
                } else {
                    try {
                        FileStatus[] files = fs.listStatus(ph);
                        if (files == null || files.length == 0) {
                            issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                                    Errors.HADOOPFS_16, hdfsDirLocation));
                        } else if (getContext().isPreview() && previewBuffer.size() < PREVIEW_SIZE) {
                            for (FileStatus fileStatus : files) {
                                if (fileStatus.isFile()) {
                                    String path = fileStatus.getPath().toString();
                                    try {
                                        List<Map.Entry> buffer;
                                        if (dataFormat == DataFormat.AVRO) {
                                            buffer = previewAvroBatch(fileStatus, PREVIEW_SIZE);
                                        } else {
                                            buffer = previewTextBatch(fileStatus, PREVIEW_SIZE);
                                        }
                                        for (int i = 0; i < buffer.size() && previewBuffer.size() < PREVIEW_SIZE; i++) {
                                            Map.Entry entry = buffer.get(i);
                                            previewBuffer.put(String.valueOf(entry.getKey()),
                                                    entry.getValue() == null ? null : entry.getValue());
                                        }
                                    } catch (IOException | InterruptedException ex) {
                                        String msg = "Error opening " + path + ": " + ex;
                                        LOG.info(msg, ex);
                                        issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(),
                                                "hdfsDirLocations", Errors.HADOOPFS_16, fileStatus.getPath()));
                                    }
                                }
                            }
                        }
                    } catch (IOException ex) {
                        issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                                Errors.HADOOPFS_09, hdfsDirLocation, ex.toString(), ex));
                    }
                }
            } catch (IOException ioe) {
                LOG.warn("Error connecting to HDFS filesystem: " + ioe, ioe);
                issues.add(getContext().createConfigIssue(Groups.HADOOP_FS.name(), "hdfsDirLocations",
                        Errors.HADOOPFS_11, hdfsDirLocation, ioe.toString(), ioe));
            }
        }
    }
    hadoopConf.set(FileInputFormat.INPUT_DIR, StringUtils.join(hdfsDirPaths, ","));
    hadoopConf.set(FileInputFormat.INPUT_DIR_RECURSIVE, Boolean.toString(recursive));
    switch (dataFormat) {
    case JSON:
        if (jsonMaxObjectLen < 1) {
            issues.add(getContext().createConfigIssue(Groups.JSON.name(), "jsonMaxObjectLen", Errors.HADOOPFS_04));
        }
        break;
    case TEXT:
        if (textMaxLineLen < 1) {
            issues.add(getContext().createConfigIssue(Groups.TEXT.name(), "textMaxLineLen", Errors.HADOOPFS_05));
        }
        break;
    case LOG:
        logDataFormatValidator = new LogDataFormatValidator(logMode, logMaxObjectLen, retainOriginalLine,
                customLogFormat, regex, grokPatternDefinition, grokPattern, enableLog4jCustomLogFormat,
                log4jCustomLogFormat, OnParseError.ERROR, 0, Groups.LOG.name(),
                getFieldPathToGroupMap(fieldPathsToGroupName));
        logDataFormatValidator.validateLogFormatConfig(issues, getContext());
        break;
    case DELIMITED:
        if (csvMaxObjectLen < 1) {
            issues.add(getContext().createConfigIssue(Groups.DELIMITED.name(), "csvMaxObjectLen", Errors.HADOOPFS_30));
        }
        break;
    case AVRO:
        if (avroSchema != null && !avroSchema.isEmpty()) {
            hadoopConf.set(AvroJob.INPUT_SCHEMA, avroSchema);
            hadoopConf.set(CONF_INPUT_KEY_SCHEMA, avroSchema);
        }
        break;
    default:
        issues.add(getContext().createConfigIssue(Groups.LOG.name(), "dataFormat", Errors.HADOOPFS_06, dataFormat));
    }
    validateParserFactoryConfigs(issues);
    LOG.info("Issues: " + issues);
    return issues;
}
From source file:org.apache.ignite.internal.processors.hadoop.GridHadoopMapReduceEmbeddedSelfTest.java
License:Apache License
/**
 * Tests whole job execution with all phases in old and new versions of API with definition of custom
 * Serialization, Partitioner and IO formats.
 * @throws Exception If fails.
 */
public void testMultiReducerWholeMapReduceExecution() throws Exception {
    IgfsPath inDir = new IgfsPath(PATH_INPUT);
    igfs.mkdirs(inDir);
    IgfsPath inFile = new IgfsPath(inDir, GridHadoopWordCount2.class.getSimpleName() + "-input");
    generateTestFile(inFile.toString(), "key1", 10000, "key2", 20000, "key3", 15000, "key4", 7000,
            "key5", 12000, "key6", 18000);
    for (int i = 0; i < 2; i++) {
        boolean useNewAPI = i == 1;
        igfs.delete(new IgfsPath(PATH_OUTPUT), true);
        flags.put("serializationWasConfigured", false);
        flags.put("partitionerWasConfigured", false);
        flags.put("inputFormatWasConfigured", false);
        flags.put("outputFormatWasConfigured", false);
        JobConf jobConf = new JobConf();
        jobConf.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());
        // To split into about 6-7 items for v2
        jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);
        // For v1
        jobConf.setInt("fs.local.block.size", 65000);
        // File system coordinates.
        setupFileSystems(jobConf);
        GridHadoopWordCount1.setTasksClasses(jobConf, !useNewAPI, !useNewAPI, !useNewAPI);
        if (!useNewAPI) {
            jobConf.setPartitionerClass(CustomV1Partitioner.class);
            jobConf.setInputFormat(CustomV1InputFormat.class);
            jobConf.setOutputFormat(CustomV1OutputFormat.class);
        }
        Job job = Job.getInstance(jobConf);
        GridHadoopWordCount2.setTasksClasses(job, useNewAPI, useNewAPI, useNewAPI);
        if (useNewAPI) {
            job.setPartitionerClass(CustomV2Partitioner.class);
            job.setInputFormatClass(CustomV2InputFormat.class);
            job.setOutputFormatClass(CustomV2OutputFormat.class);
        }
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path(igfsScheme() + inFile.toString()));
        FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_OUTPUT));
        job.setNumReduceTasks(3);
        job.setJarByClass(GridHadoopWordCount2.class);
        IgniteInternalFuture<?> fut = grid(0).hadoop().submit(new GridHadoopJobId(UUID.randomUUID(), 1),
                createJobInfo(job.getConfiguration()));
        fut.get();
        assertTrue("Serialization was configured (new API is " + useNewAPI + ")",
                flags.get("serializationWasConfigured"));
        assertTrue("Partitioner was configured (new API is = " + useNewAPI + ")",
                flags.get("partitionerWasConfigured"));
        assertTrue("Input format was configured (new API is = " + useNewAPI + ")",
                flags.get("inputFormatWasConfigured"));
        assertTrue("Output format was configured (new API is = " + useNewAPI + ")",
                flags.get("outputFormatWasConfigured"));
        assertEquals("Use new API = " + useNewAPI, "key3\t15000\n" + "key6\t18000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00000"));
        assertEquals("Use new API = " + useNewAPI, "key1\t10000\n" + "key4\t7000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00001"));
        assertEquals("Use new API = " + useNewAPI, "key2\t20000\n" + "key5\t12000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00002"));
    }
}
From source file:org.apache.ignite.internal.processors.hadoop.GridHadoopMapReduceTest.java
License:Apache License
/**
 * Tests whole job execution with all phases in all combination of new and old versions of API.
 * @throws Exception If fails.
 */
public void testWholeMapReduceExecution() throws Exception {
    IgfsPath inDir = new IgfsPath(PATH_INPUT);
    igfs.mkdirs(inDir);
    IgfsPath inFile = new IgfsPath(inDir, GridHadoopWordCount2.class.getSimpleName() + "-input");
    generateTestFile(inFile.toString(), "red", 100000, "blue", 200000, "green", 150000, "yellow", 70000);
    for (int i = 0; i < 8; i++) {
        igfs.delete(new IgfsPath(PATH_OUTPUT), true);
        boolean useNewMapper = (i & 1) == 0;
        boolean useNewCombiner = (i & 2) == 0;
        boolean useNewReducer = (i & 4) == 0;
        JobConf jobConf = new JobConf();
        jobConf.set(JOB_COUNTER_WRITER_PROPERTY, GridHadoopFSCounterWriter.class.getName());
        jobConf.setUser("yyy");
        jobConf.set(GridHadoopFSCounterWriter.COUNTER_WRITER_DIR_PROPERTY, "/xxx/${USER}/zzz");
        // To split into about 40 items for v2
        jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);
        // For v1
        jobConf.setInt("fs.local.block.size", 65000);
        // File system coordinates.
        setupFileSystems(jobConf);
        GridHadoopWordCount1.setTasksClasses(jobConf, !useNewMapper, !useNewCombiner, !useNewReducer);
        Job job = Job.getInstance(jobConf);
        GridHadoopWordCount2.setTasksClasses(job, useNewMapper, useNewCombiner, useNewReducer);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path(igfsScheme() + inFile.toString()));
        FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_OUTPUT));
        job.setJarByClass(GridHadoopWordCount2.class);
        GridHadoopJobId jobId = new GridHadoopJobId(UUID.randomUUID(), 1);
        IgniteInternalFuture<?> fut = grid(0).hadoop().submit(jobId, createJobInfo(job.getConfiguration()));
        fut.get();
        checkJobStatistics(jobId);
        assertEquals(
                "Use new mapper: " + useNewMapper + ", new combiner: " + useNewCombiner + ", new reducer: "
                        + useNewReducer,
                "blue\t200000\n" + "green\t150000\n" + "red\t100000\n" + "yellow\t70000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewReducer ? "part-r-" : "part-") + "00000"));
    }
}
From source file:org.apache.ignite.internal.processors.hadoop.HadoopAbstractMapReduceTest.java
License:Apache License
/**
 * Does actual test job
 *
 * @param useNewMapper flag to use new mapper API.
 * @param useNewCombiner flag to use new combiner API.
 * @param useNewReducer flag to use new reducer API.
 */
protected final void doTest(IgfsPath inFile, boolean useNewMapper, boolean useNewCombiner,
        boolean useNewReducer) throws Exception {
    igfs.delete(new IgfsPath(PATH_OUTPUT), true);
    JobConf jobConf = new JobConf();
    jobConf.set(JOB_COUNTER_WRITER_PROPERTY, IgniteHadoopFileSystemCounterWriter.class.getName());
    jobConf.setUser(USER);
    jobConf.set(IgniteHadoopFileSystemCounterWriter.COUNTER_WRITER_DIR_PROPERTY, "/xxx/${USER}/zzz");
    // To split into about 40 items for v2
    jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);
    // For v1
    jobConf.setInt("fs.local.block.size", 65000);
    // File system coordinates.
    setupFileSystems(jobConf);
    HadoopWordCount1.setTasksClasses(jobConf, !useNewMapper, !useNewCombiner, !useNewReducer);
    Job job = Job.getInstance(jobConf);
    HadoopWordCount2.setTasksClasses(job, useNewMapper, useNewCombiner, useNewReducer, compressOutputSnappy());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.setInputPaths(job, new Path(igfsScheme() + inFile.toString()));
    FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_OUTPUT));
    job.setJarByClass(HadoopWordCount2.class);
    HadoopJobId jobId = new HadoopJobId(UUID.randomUUID(), 1);
    IgniteInternalFuture<?> fut = grid(0).hadoop().submit(jobId, createJobInfo(job.getConfiguration()));
    fut.get();
    checkJobStatistics(jobId);
    final String outFile = PATH_OUTPUT + "/" + (useNewReducer ? "part-r-" : "part-") + "00000";
    checkOwner(new IgfsPath(PATH_OUTPUT + "/" + "_SUCCESS"));
    checkOwner(new IgfsPath(outFile));
    String actual = readAndSortFile(outFile, job.getConfiguration());
    assertEquals(
            "Use new mapper: " + useNewMapper + ", new combiner: " + useNewCombiner + ", new reducer: "
                    + useNewReducer,
            "blue\t" + blue + "\n" + "green\t" + green + "\n" + "red\t" + red + "\n" + "yellow\t" + yellow + "\n",
            actual);
}
From source file:org.apache.ignite.internal.processors.hadoop.HadoopMapReduceEmbeddedSelfTest.java
License:Apache License
/**
 * Tests whole job execution with all phases in old and new versions of API with definition of custom
 * Serialization, Partitioner and IO formats.
 * @throws Exception If fails.
 */
public void testMultiReducerWholeMapReduceExecution() throws Exception {
    IgfsPath inDir = new IgfsPath(PATH_INPUT);
    igfs.mkdirs(inDir);
    IgfsPath inFile = new IgfsPath(inDir, HadoopWordCount2.class.getSimpleName() + "-input");
    generateTestFile(inFile.toString(), "key1", 10000, "key2", 20000, "key3", 15000, "key4", 7000,
            "key5", 12000, "key6", 18000);
    for (int i = 0; i < 2; i++) {
        boolean useNewAPI = i == 1;
        igfs.delete(new IgfsPath(PATH_OUTPUT), true);
        flags.put("serializationWasConfigured", false);
        flags.put("partitionerWasConfigured", false);
        flags.put("inputFormatWasConfigured", false);
        flags.put("outputFormatWasConfigured", false);
        JobConf jobConf = new JobConf();
        jobConf.set(CommonConfigurationKeys.IO_SERIALIZATIONS_KEY, CustomSerialization.class.getName());
        // To split into about 6-7 items for v2
        jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);
        // For v1
        jobConf.setInt("fs.local.block.size", 65000);
        // File system coordinates.
        setupFileSystems(jobConf);
        HadoopWordCount1.setTasksClasses(jobConf, !useNewAPI, !useNewAPI, !useNewAPI);
        if (!useNewAPI) {
            jobConf.setPartitionerClass(CustomV1Partitioner.class);
            jobConf.setInputFormat(CustomV1InputFormat.class);
            jobConf.setOutputFormat(CustomV1OutputFormat.class);
        }
        Job job = Job.getInstance(jobConf);
        HadoopWordCount2.setTasksClasses(job, useNewAPI, useNewAPI, useNewAPI);
        if (useNewAPI) {
            job.setPartitionerClass(CustomV2Partitioner.class);
            job.setInputFormatClass(CustomV2InputFormat.class);
            job.setOutputFormatClass(CustomV2OutputFormat.class);
        }
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path(igfsScheme() + inFile.toString()));
        FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_OUTPUT));
        job.setNumReduceTasks(3);
        job.setJarByClass(HadoopWordCount2.class);
        IgniteInternalFuture<?> fut = grid(0).hadoop().submit(new HadoopJobId(UUID.randomUUID(), 1),
                createJobInfo(job.getConfiguration()));
        fut.get();
        assertTrue("Serialization was configured (new API is " + useNewAPI + ")",
                flags.get("serializationWasConfigured"));
        assertTrue("Partitioner was configured (new API is = " + useNewAPI + ")",
                flags.get("partitionerWasConfigured"));
        assertTrue("Input format was configured (new API is = " + useNewAPI + ")",
                flags.get("inputFormatWasConfigured"));
        assertTrue("Output format was configured (new API is = " + useNewAPI + ")",
                flags.get("outputFormatWasConfigured"));
        assertEquals("Use new API = " + useNewAPI, "key3\t15000\n" + "key6\t18000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00000"));
        assertEquals("Use new API = " + useNewAPI, "key1\t10000\n" + "key4\t7000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00001"));
        assertEquals("Use new API = " + useNewAPI, "key2\t20000\n" + "key5\t12000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewAPI ? "part-r-" : "part-") + "00002"));
    }
}
From source file:org.apache.ignite.internal.processors.hadoop.HadoopMapReduceTest.java
License:Apache License
/**
 * Tests whole job execution with all phases in all combination of new and old versions of API.
 * @throws Exception If fails.
 */
public void testWholeMapReduceExecution() throws Exception {
    IgfsPath inDir = new IgfsPath(PATH_INPUT);
    igfs.mkdirs(inDir);
    IgfsPath inFile = new IgfsPath(inDir, HadoopWordCount2.class.getSimpleName() + "-input");
    generateTestFile(inFile.toString(), "red", 100000, "blue", 200000, "green", 150000, "yellow", 70000);
    for (int i = 0; i < 8; i++) {
        igfs.delete(new IgfsPath(PATH_OUTPUT), true);
        boolean useNewMapper = (i & 1) == 0;
        boolean useNewCombiner = (i & 2) == 0;
        boolean useNewReducer = (i & 4) == 0;
        JobConf jobConf = new JobConf();
        jobConf.set(JOB_COUNTER_WRITER_PROPERTY, IgniteHadoopFileSystemCounterWriter.class.getName());
        jobConf.setUser("yyy");
        jobConf.set(IgniteHadoopFileSystemCounterWriter.COUNTER_WRITER_DIR_PROPERTY, "/xxx/${USER}/zzz");
        // To split into about 40 items for v2
        jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);
        // For v1
        jobConf.setInt("fs.local.block.size", 65000);
        // File system coordinates.
        setupFileSystems(jobConf);
        HadoopWordCount1.setTasksClasses(jobConf, !useNewMapper, !useNewCombiner, !useNewReducer);
        Job job = Job.getInstance(jobConf);
        HadoopWordCount2.setTasksClasses(job, useNewMapper, useNewCombiner, useNewReducer);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path(igfsScheme() + inFile.toString()));
        FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_OUTPUT));
        job.setJarByClass(HadoopWordCount2.class);
        HadoopJobId jobId = new HadoopJobId(UUID.randomUUID(), 1);
        IgniteInternalFuture<?> fut = grid(0).hadoop().submit(jobId, createJobInfo(job.getConfiguration()));
        fut.get();
        checkJobStatistics(jobId);
        assertEquals(
                "Use new mapper: " + useNewMapper + ", new combiner: " + useNewCombiner + ", new reducer: "
                        + useNewReducer,
                "blue\t200000\n" + "green\t150000\n" + "red\t100000\n" + "yellow\t70000\n",
                readAndSortFile(PATH_OUTPUT + "/" + (useNewReducer ? "part-r-" : "part-") + "00000"));
    }
}
From source file:org.apache.ignite.internal.processors.hadoop.impl.HadoopAbstractMapReduceTest.java
License:Apache License
/**
 * Does actual test job
 *
 * @param useNewMapper flag to use new mapper API.
 * @param useNewCombiner flag to use new combiner API.
 * @param useNewReducer flag to use new reducer API.
 */
protected final void doTest(IgfsPath inFile, boolean useNewMapper, boolean useNewCombiner,
        boolean useNewReducer) throws Exception {
    log.info("useNewMapper=" + useNewMapper + ", useNewCombiner=" + useNewCombiner + ", useNewReducer="
            + useNewReducer);
    igfs.delete(new IgfsPath(PATH_OUTPUT), true);
    JobConf jobConf = new JobConf();
    jobConf.set(HadoopCommonUtils.JOB_COUNTER_WRITER_PROPERTY,
            IgniteHadoopFileSystemCounterWriter.class.getName());
    jobConf.setUser(USER);
    jobConf.set(IgniteHadoopFileSystemCounterWriter.COUNTER_WRITER_DIR_PROPERTY, "/xxx/${USER}/zzz");
    // To split into about 40 items for v2
    jobConf.setInt(FileInputFormat.SPLIT_MAXSIZE, 65000);
    // For v1
    jobConf.setInt("fs.local.block.size", 65000);
    // File system coordinates.
    setupFileSystems(jobConf);
    HadoopWordCount1.setTasksClasses(jobConf, !useNewMapper, !useNewCombiner, !useNewReducer);
    Job job = Job.getInstance(jobConf);
    HadoopWordCount2.setTasksClasses(job, useNewMapper, useNewCombiner, useNewReducer, compressOutputSnappy());
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.setInputPaths(job, new Path(igfsScheme() + inFile.toString()));
    FileOutputFormat.setOutputPath(job, new Path(igfsScheme() + PATH_OUTPUT));
    job.setJarByClass(HadoopWordCount2.class);
    HadoopJobId jobId = new HadoopJobId(UUID.randomUUID(), 1);
    IgniteInternalFuture<?> fut = grid(0).hadoop().submit(jobId, createJobInfo(job.getConfiguration()));
    fut.get();
    checkJobStatistics(jobId);
    final String outFile = PATH_OUTPUT + "/" + (useNewReducer ? "part-r-" : "part-") + "00000";
    checkOwner(new IgfsPath(PATH_OUTPUT + "/" + "_SUCCESS"));
    checkOwner(new IgfsPath(outFile));
    String actual = readAndSortFile(outFile, job.getConfiguration());
    assertEquals(
            "Use new mapper: " + useNewMapper + ", new combiner: " + useNewCombiner + ", new reducer: "
                    + useNewReducer,
            "blue\t" + blue + "\n" + "green\t" + green + "\n" + "red\t" + red + "\n" + "yellow\t" + yellow + "\n",
            actual);
}