List of usage examples for org.apache.hadoop.mapreduce JobContext getConfiguration
public Configuration getConfiguration();
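All of the examples below follow the same basic pattern: from inside an InputFormat (or other job-side code), call JobContext.getConfiguration() to read settings that were placed in the job Configuration on the driver side. As a minimal sketch of that pattern, the class name MySketchInputFormat and the property name "my.example.limit" below are hypothetical placeholders, not part of any of the projects listed on this page.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

// Minimal sketch: read a job property via JobContext.getConfiguration() inside getSplits().
// "my.example.limit" is a made-up property name used only for illustration.
public class MySketchInputFormat extends InputFormat<LongWritable, Text> {

    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
        // getConfiguration() returns the job's Configuration as set on the driver side
        int limit = context.getConfiguration().getInt("my.example.limit", Integer.MAX_VALUE);

        // A real implementation would compute the splits here; this sketch only shows
        // how the configured limit would be applied to whatever list was built.
        List<InputSplit> splits = new ArrayList<InputSplit>();
        if (splits.size() > limit) {
            splits = splits.subList(0, limit);
        }
        return splits;
    }

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
        throw new UnsupportedOperationException("sketch only");
    }
}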
From source file:com.ikanow.aleph2.analytics.r.assets.BeFileInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    logger.debug("BeFileInputFormat.getSplits");
    super.setMaxSplitSize(MAX_SPLIT_SIZE);

    try {
        final List<InputSplit> splits = Lambdas.get(Lambdas.wrap_u(() -> {
            final List<InputSplit> tmp = super.getSplits(context);

            String debug_max_str = context.getConfiguration().get(BatchEnrichmentJob.BE_DEBUG_MAX_SIZE);
            if (null != debug_max_str) {
                final int requested_records = Integer.parseInt(debug_max_str);
                // dump 5x the requested number of splits into one mega split,
                // to strike a balance between limiting the data and making sure
                // that tests generate enough records
                final CombineFileSplit combined = new CombineFileSplit(
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(split -> Arrays.stream(split.getPaths()))
                                .limit(5L * requested_records)
                                .<Path>toArray(size -> new Path[size]),
                        ArrayUtils.toPrimitive(
                                tmp.stream().map(split -> (CombineFileSplit) split)
                                        .flatMap(split -> Arrays.stream(split.getStartOffsets()).boxed())
                                        .limit(5L * requested_records)
                                        .<Long>toArray(size -> new Long[size]),
                                0L),
                        ArrayUtils.toPrimitive(
                                tmp.stream().map(split -> (CombineFileSplit) split)
                                        .flatMap(split -> Arrays.stream(split.getLengths()).boxed())
                                        .limit(5L * requested_records)
                                        .<Long>toArray(size -> new Long[size]),
                                0L),
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(Lambdas.wrap_u(split -> Arrays.stream(split.getLocations())))
                                .limit(5L * requested_records)
                                .<String>toArray(size -> new String[size]));

                return Arrays.<InputSplit>asList(combined);
            } else
                return tmp;
        }));

        logger.debug("BeFileInputFormat.getSplits: " + ((splits != null) ? splits.size() : "null"));
        return splits;
    } catch (Throwable t) {
        logger.error(ErrorUtils.getLongForm("Error getting splits, error = {0}", t));
        return Collections.emptyList();
    }
}
From source file:com.ikanow.aleph2.analytics.spark.assets.BeFileInputFormat_Pure.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    logger.debug("BeFileInputFormat.getSplits");
    super.setMaxSplitSize(MAX_SPLIT_SIZE);

    try {
        final List<InputSplit> splits = Lambdas.get(Lambdas.wrap_u(() -> {
            final List<InputSplit> tmp = super.getSplits(context);

            String debug_max_str = context.getConfiguration().get(HadoopBatchEnrichmentUtils.BE_DEBUG_MAX_SIZE);
            if (null != debug_max_str) {
                final int requested_records = Integer.parseInt(debug_max_str);
                // dump 5x the requested number of splits into one mega split,
                // to strike a balance between limiting the data and making sure
                // that tests generate enough records
                final CombineFileSplit combined = new CombineFileSplit(
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(split -> Arrays.stream(split.getPaths()))
                                .limit(5L * requested_records)
                                .<Path>toArray(size -> new Path[size]),
                        ArrayUtils.toPrimitive(
                                tmp.stream().map(split -> (CombineFileSplit) split)
                                        .flatMap(split -> Arrays.stream(split.getStartOffsets()).boxed())
                                        .limit(5L * requested_records)
                                        .<Long>toArray(size -> new Long[size]),
                                0L),
                        ArrayUtils.toPrimitive(
                                tmp.stream().map(split -> (CombineFileSplit) split)
                                        .flatMap(split -> Arrays.stream(split.getLengths()).boxed())
                                        .limit(5L * requested_records)
                                        .<Long>toArray(size -> new Long[size]),
                                0L),
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(Lambdas.wrap_u(split -> Arrays.stream(split.getLocations())))
                                .limit(5L * requested_records)
                                .<String>toArray(size -> new String[size]));

                return Arrays.<InputSplit>asList(combined);
            } else
                return tmp;
        }));

        logger.debug("BeFileInputFormat.getSplits: " + ((splits != null) ? splits.size() : "null"));
        return splits;
    } catch (Throwable t) {
        logger.error(ErrorUtils.getLongForm("Error getting splits, error = {0}", t));
        return Collections.emptyList();
    }
}
From source file:com.ikanow.aleph2.search_service.elasticsearch.hadoop.assets.Aleph2EsInputFormat.java
License:Apache License
public static List<InputSplit> getSplits(Function<JobContext, List<InputSplit>> get_splits, JobContext context) {
    final String[] indexes = context.getConfiguration().get(ALEPH2_RESOURCE, "").split(",,");

    return Arrays.stream(indexes).<InputSplit>flatMap(Lambdas.wrap_u(index -> {
        context.getConfiguration().set("es.resource.read", index.replace(" ", "%20"));
        return get_splits.apply(context).stream();
    })).collect(Collectors.<InputSplit>toList());
}
From source file:com.ikanow.aleph2.search_service.elasticsearch.hadoop.assets.TestAleph2EsInputFormat.java
License:Apache License
@Test
public void test_Aleph2EsInputFormat() {
    new Aleph2EsInputFormat(); // code coverage!

    final JobContext mock_context = Mockito.mock(JobContext.class);
    final Configuration test_config = new Configuration(false);
    test_config.set("aleph2.es.resource", "test1,,test ,2");
    Mockito.when(mock_context.getConfiguration()).thenReturn(test_config);

    final List<InputSplit> res = Aleph2EsInputFormat.getSplits(ctxt -> {
        final InputSplit split1 = Mockito.mock(InputSplit.class);
        final String es_res = ctxt.getConfiguration().get("es.resource.read").toString();
        Mockito.when(split1.toString()).thenReturn(es_res);
        return Arrays.asList(split1);
    }, mock_context);

    assertEquals(Arrays.asList("test1", "test%20,2"),
            res.stream().map(fmt -> fmt.toString()).collect(Collectors.toList()));
}
From source file:com.ikanow.infinit.e.data_model.custom.InfiniteEsInputFormat.java
License:Apache License
@Override
public List getSplits(JobContext arg0) throws IOException, InterruptedException {
    LinkedList<InputSplit> fullList = new LinkedList<InputSplit>();

    String indexes[] = arg0.getConfiguration().get("es.resource").split("\\s*,,\\s*");
    for (String index : indexes) {
        _delegate = new EsInputFormat(); // create a new input format for each object
        arg0.getConfiguration().set("es.resource.read", index.replace(" ", "%20")); // (spaces in types cause problems)

        @SuppressWarnings("unchecked")
        List<InputSplit> list = _delegate.getSplits(arg0);

        if (LOCAL_DEBUG_MODE)
            enableInputSplitDebugMode(list);

        fullList.addAll(list);
    }
    return fullList;
}
From source file:com.ikanow.infinit.e.data_model.custom.InfiniteFileInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    // Test mode needs to restrict the docs also, otherwise things can get out of hand...
    int debugLimit = job.getConfiguration().getInt("mongo.input.limit", Integer.MAX_VALUE);
    if (debugLimit <= 0) { // (just not set)
        debugLimit = Integer.MAX_VALUE;
    }
    List<InputSplit> splits = super.getSplits(job);
    if (splits.size() > debugLimit) {
        splits = splits.subList(0, debugLimit);
    }
    return splits;
}
From source file:com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext context) {
    final Configuration hadoopConfiguration = context.getConfiguration();
    final InfiniteMongoConfig conf = new InfiniteMongoConfig(hadoopConfiguration);
    List<InputSplit> splits = InfiniteMongoSplitter.calculateSplits(conf);

    if (conf.getSelfMerge() != null) {
        // check if we need to grab existing records and add them to the splits
        final Configuration existingConfiguration = context.getConfiguration();
        existingConfiguration.set("mongo.input.uri", conf.getSelfMerge());
        BasicDBObject query = new BasicDBObject();
        // add on this query to only get items previous to now if no reducer is specified (otherwise
        // we will leak any items we map on the first run back in before this split runs)
        if (context.getNumReduceTasks() == 0)
            query.put("_id", new BasicDBObject(MongoDbManager.lt_, new ObjectId()));
        existingConfiguration.set("mongo.input.query", query.toString());

        final InfiniteMongoConfig existingConf = new InfiniteMongoConfig(existingConfiguration);
        splits.addAll(InfiniteMongoSplitter.calculateSplits(existingConf));
    }
    return splits;
}
From source file:com.ikanow.infinit.e.data_model.custom.InfiniteShareInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
    // Test mode needs to restrict the docs also, otherwise things can get out of hand...
    int debugLimit = job.getConfiguration().getInt("mongo.input.limit", Integer.MAX_VALUE);
    if (debugLimit <= 0) { // (just not set)
        debugLimit = Integer.MAX_VALUE;
    }

    // Get the already authenticated list of share ids
    String oidStrs[] = job.getConfiguration().getStrings("mapred.input.dir");

    List<InputSplit> splits = new LinkedList<InputSplit>();
    int numSplits = 0;

    String sourceStr = job.getConfiguration().get("mongo.input.query");
    SourcePojo source = ApiManager.mapFromApi(sourceStr, SourcePojo.class, null);
    SourceFileConfigPojo fileConfig = source.getFileConfig();

    Pattern pathInclude = null;
    Pattern pathExclude = null;
    if (null != fileConfig.pathInclude) {
        pathInclude = Pattern.compile(fileConfig.pathInclude);
    }
    if (null != fileConfig.pathExclude) {
        pathExclude = Pattern.compile(fileConfig.pathExclude);
    }

    for (String oidStr : oidStrs) {
        try {
            BasicDBObject query = new BasicDBObject(SharePojo._id_, new ObjectId(oidStr));
            BasicDBObject fields = new BasicDBObject(SharePojo.binaryId_, 1);
            fields.put(SharePojo.title_, 1);
            SharePojo share = SharePojo.fromDb(DbManager.getSocial().getShare().findOne(query, fields),
                    SharePojo.class);

            if ((null != share) && (null != share.getBinaryId())) {
                GridFSRandomAccessFile file = new GridFSRandomAccessFile(
                        MongoDbManager.getSocial().getShareBinary(), share.getBinaryId());
                GridFSZipFile zipView = new GridFSZipFile(share.getTitle(), file);

                @SuppressWarnings("unchecked")
                Enumeration<net.sf.jazzlib.ZipEntry> entries = zipView.entries();
                while (entries.hasMoreElements()) {
                    net.sf.jazzlib.ZipEntry zipInfo = entries.nextElement();
                    if (zipInfo.isDirectory()) {
                        continue;
                    }
                    if (null != pathInclude) {
                        if (!pathInclude.matcher(zipInfo.getName()).matches()) {
                            continue;
                        }
                    }
                    if (null != pathExclude) {
                        if (pathExclude.matcher(zipInfo.getName()).matches()) {
                            continue;
                        }
                    }
                    InfiniteShareInputSplit split = new InfiniteShareInputSplit(share.get_id(),
                            share.getBinaryId(), zipInfo.getName(), zipInfo.getSize(), zipInfo.getTime());
                    splits.add(split);
                    if (++numSplits >= debugLimit) {
                        break;
                    }
                    //DEBUG
                    //System.out.println("ADD NEW SPLIT: " + share.get_id() + " , " + share.getBinaryId() + " , " + zipInfo.getName() + " , " + zipInfo.getSize());
                }
            }
        } catch (Exception e) {
        } // (this would be an internal logic error)
    }
    return splits;
}
From source file:com.ikanow.infinit.e.processing.custom.utils.InfiniteElasticsearchMongoInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext context) {
    final Configuration hadoopConfiguration = context.getConfiguration();
    final InfiniteMongoConfig conf = new InfiniteMongoConfig(hadoopConfiguration);

    String queryStr = hadoopConfiguration.get("mongo.input.query");
    String userIdStr = hadoopConfiguration.get("infinit.e.userid");
    AdvancedQueryPojo query = AdvancedQueryPojo.fromApi(queryStr, AdvancedQueryPojo.class);

    return calculateSplits(query, userIdStr, conf);
}
From source file:com.inmobi.conduit.distcp.tools.mapred.lib.TestDynamicInputFormat.java
License:Apache License
@Test
public void testGetSplits() throws Exception {
    DistCpOptions options = getOptions();
    Configuration configuration = new Configuration();
    configuration.set("mapred.map.tasks", String.valueOf(options.getMaxMaps()));
    CopyListing.getCopyListing(configuration, CREDENTIALS, options).buildListing(
            new Path(cluster.getFileSystem().getUri().toString() + "/tmp/testDynInputFormat/fileList.seq"),
            options);

    JobID jobId = new JobID();
    JobContext jobContext = mock(JobContext.class);
    when(jobContext.getConfiguration()).thenReturn(configuration);
    when(jobContext.getJobID()).thenReturn(jobId);

    DynamicInputFormat<Text, FileStatus> inputFormat = new DynamicInputFormat<Text, FileStatus>();
    List<InputSplit> splits = inputFormat.getSplits(jobContext);

    int nFiles = 0;
    int taskId = 0;

    for (InputSplit split : splits) {
        TaskAttemptID tId = new TaskAttemptID("", 0, true, taskId, 0);
        final TaskAttemptContext taskAttemptContext = mock(TaskAttemptContext.class);
        when(taskAttemptContext.getConfiguration()).thenReturn(configuration);
        when(taskAttemptContext.getTaskAttemptID()).thenReturn(tId);

        RecordReader<Text, FileStatus> recordReader = inputFormat.createRecordReader(split, taskAttemptContext);
        recordReader.initialize(splits.get(0), taskAttemptContext);

        float previousProgressValue = 0f;
        while (recordReader.nextKeyValue()) {
            FileStatus fileStatus = recordReader.getCurrentValue();
            String source = fileStatus.getPath().toString();
            System.out.println(source);
            Assert.assertTrue(expectedFilePaths.contains(source));

            final float progress = recordReader.getProgress();
            Assert.assertTrue(progress >= previousProgressValue);
            Assert.assertTrue(progress >= 0.0f);
            Assert.assertTrue(progress <= 1.0f);
            previousProgressValue = progress;
            ++nFiles;
        }
        Assert.assertTrue(recordReader.getProgress() == 1.0f);
        ++taskId;
    }

    Assert.assertEquals(expectedFilePaths.size(), nFiles);
}