Example usage for org.apache.hadoop.mapreduce JobContext getConfiguration

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.JobContext.getConfiguration().

Prototype

public Configuration getConfiguration();

Document

Return the configuration for the job.
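
Before the per-project examples, here is a minimal, self-contained sketch of the typical call site: an InputFormat reads job-level settings from JobContext.getConfiguration() inside getSplits(). The class name and the property name "my.custom.split.limit" are assumptions used only for illustration, not part of any source file below.

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

public class LimitAwareInputFormat extends TextInputFormat {
    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException {
        // JobContext hands back the live job Configuration
        final Configuration conf = context.getConfiguration();
        // Hypothetical property name, used only for this sketch
        final int limit = conf.getInt("my.custom.split.limit", Integer.MAX_VALUE);
        final List<InputSplit> splits = super.getSplits(context);
        return (splits.size() > limit) ? splits.subList(0, limit) : splits;
    }
}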

Usage

From source file: com.ikanow.aleph2.analytics.r.assets.BeFileInputFormat.java

License: Apache License

@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    logger.debug("BeFileInputFormat.getSplits");

    super.setMaxSplitSize(MAX_SPLIT_SIZE);

    try {
        final List<InputSplit> splits = Lambdas.get(Lambdas.wrap_u(() -> {
            final List<InputSplit> tmp = super.getSplits(context);

            String debug_max_str = context.getConfiguration().get(BatchEnrichmentJob.BE_DEBUG_MAX_SIZE);
            if (null != debug_max_str) {
                final int requested_records = Integer.parseInt(debug_max_str);

                // dump 5x the requested number of records into one mega split
                // to strike a balance between limiting the data and making sure
                // that enough records are generated for tests

                final CombineFileSplit combined = new CombineFileSplit(
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(split -> Arrays.stream(split.getPaths())).limit(5L * requested_records)
                                .<Path>toArray(size -> new Path[size]),
                        ArrayUtils.toPrimitive(
                                tmp.stream().map(split -> (CombineFileSplit) split)
                                        .flatMap(split -> Arrays.stream(split.getStartOffsets()).boxed())
                                        .limit(5L * requested_records).<Long>toArray(size -> new Long[size]),
                                0L),
                        ArrayUtils.toPrimitive(
                                tmp.stream().map(split -> (CombineFileSplit) split)
                                        .flatMap(split -> Arrays.stream(split.getLengths()).boxed())
                                        .limit(5L * requested_records).<Long>toArray(size -> new Long[size]),
                                0L),
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(Lambdas.wrap_u(split -> Arrays.stream(split.getLocations())))
                                .limit(5L * requested_records).<String>toArray(size -> new String[size]));
                return Arrays.<InputSplit>asList(combined);
            } else
                return tmp;
        }));

        logger.debug("BeFileInputFormat.getSplits: " + ((splits != null) ? splits.size() : "null"));
        return splits;

    } catch (Throwable t) {
        logger.error(ErrorUtils.getLongForm("Error getting splits, error = {0}", t));

        return Collections.emptyList();
    }
}
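
The debug cap above is read from the job configuration, so a driver (or test) has to set it before submission. A minimal driver-side sketch, assuming BatchEnrichmentJob.BE_DEBUG_MAX_SIZE is the String property key read by getSplits() and that "10" is an acceptable value:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class BeDebugJobBuilder {
    // Sketch only: wires the input format above into a job with a record cap for tests
    public static Job buildDebugJob(Configuration conf) throws IOException {
        final Job job = Job.getInstance(conf);
        job.getConfiguration().set(BatchEnrichmentJob.BE_DEBUG_MAX_SIZE, "10"); // assumed key/value
        job.setInputFormatClass(BeFileInputFormat.class);
        return job;
    }
}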

From source file: com.ikanow.aleph2.analytics.spark.assets.BeFileInputFormat_Pure.java

License: Apache License

@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    logger.debug("BeFileInputFormat.getSplits");

    super.setMaxSplitSize(MAX_SPLIT_SIZE);

    try {
        final List<InputSplit> splits = Lambdas.get(Lambdas.wrap_u(() -> {
            final List<InputSplit> tmp = super.getSplits(context);

            String debug_max_str = context.getConfiguration().get(HadoopBatchEnrichmentUtils.BE_DEBUG_MAX_SIZE);
            if (null != debug_max_str) {
                final int requested_records = Integer.parseInt(debug_max_str);

                // dump 5x the requested number of records into one mega split
                // to strike a balance between limiting the data and making sure
                // that enough records are generated for tests

                final CombineFileSplit combined = new CombineFileSplit(
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(split -> Arrays.stream(split.getPaths())).limit(5L * requested_records)
                                .<Path>toArray(size -> new Path[size]),
                        ArrayUtils.toPrimitive(
                                tmp.stream().map(split -> (CombineFileSplit) split)
                                        .flatMap(split -> Arrays.stream(split.getStartOffsets()).boxed())
                                        .limit(5L * requested_records).<Long>toArray(size -> new Long[size]),
                                0L),
                        ArrayUtils.toPrimitive(
                                tmp.stream().map(split -> (CombineFileSplit) split)
                                        .flatMap(split -> Arrays.stream(split.getLengths()).boxed())
                                        .limit(5L * requested_records).<Long>toArray(size -> new Long[size]),
                                0L),
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(Lambdas.wrap_u(split -> Arrays.stream(split.getLocations())))
                                .limit(5L * requested_records).<String>toArray(size -> new String[size]));
                return Arrays.<InputSplit>asList(combined);
            } else
                return tmp;
        }));

        logger.debug("BeFileInputFormat.getSplits: " + ((splits != null) ? splits.size() : "null"));
        return splits;

    } catch (Throwable t) {
        logger.error(ErrorUtils.getLongForm("Error getting splits, error = {0}", t));

        return Collections.emptyList();
    }
}

From source file: com.ikanow.aleph2.search_service.elasticsearch.hadoop.assets.Aleph2EsInputFormat.java

License: Apache License

public static List<InputSplit> getSplits(Function<JobContext, List<InputSplit>> get_splits,
        JobContext context) {
    final String[] indexes = context.getConfiguration().get(ALEPH2_RESOURCE, "").split(",,");

    return Arrays.stream(indexes).<InputSplit>flatMap(Lambdas.wrap_u(index -> {
        context.getConfiguration().set("es.resource.read", index.replace(" ", "%20"));
        return get_splits.apply(context).stream();
    })).collect(Collectors.<InputSplit>toList());
}
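
Because the helper above takes the split calculation as a java.util.function.Function, a caller has to adapt the checked exceptions that InputFormat.getSplits(JobContext) declares. A minimal, self-contained sketch of one way to drive it; the wrapped format and the RuntimeException wrapping are assumptions of this sketch, not part of the source file:

import java.io.IOException;
import java.util.List;

import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;

public class DelegatingSplitHelper {
    // Runs the wrapped format once per comma-separated index via the static helper above
    public static List<InputSplit> splitsFor(InputFormat<?, ?> wrapped, JobContext context) {
        return Aleph2EsInputFormat.getSplits(ctxt -> {
            try {
                return wrapped.getSplits(ctxt);
            } catch (IOException | InterruptedException e) {
                throw new RuntimeException(e); // adapt checked exceptions for the Function
            }
        }, context);
    }
}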

From source file: com.ikanow.aleph2.search_service.elasticsearch.hadoop.assets.TestAleph2EsInputFormat.java

License: Apache License

@Test
public void test_Aleph2EsInputFormat() {

    new Aleph2EsInputFormat(); //code coverage!

    final JobContext mock_context = Mockito.mock(JobContext.class);

    final Configuration test_config = new Configuration(false);

    test_config.set("aleph2.es.resource", "test1,,test ,2");

    Mockito.when(mock_context.getConfiguration()).thenReturn(test_config);

    final List<InputSplit> res = Aleph2EsInputFormat.getSplits(ctxt -> {
        final InputSplit split1 = Mockito.mock(InputSplit.class);
        final String es_res = ctxt.getConfiguration().get("es.resource.read").toString();

        Mockito.when(split1.toString()).thenReturn(es_res);
        return Arrays.asList(split1);
    }, mock_context);

    assertEquals(Arrays.asList("test1", "test%20,2"),
            res.stream().map(fmt -> fmt.toString()).collect(Collectors.toList()));
}

From source file: com.ikanow.infinit.e.data_model.custom.InfiniteEsInputFormat.java

License: Apache License

@Override
public List getSplits(JobContext arg0) throws IOException, InterruptedException {

    LinkedList<InputSplit> fullList = new LinkedList<InputSplit>();

    String indexes[] = arg0.getConfiguration().get("es.resource").split("\\s*,,\\s*");
    for (String index : indexes) {
        _delegate = new EsInputFormat(); // create a new input format for each object

        arg0.getConfiguration().set("es.resource.read", index.replace(" ", "%20")); // (spaces in types cause problems)

        @SuppressWarnings("unchecked")
        List<InputSplit> list = _delegate.getSplits(arg0);
        if (LOCAL_DEBUG_MODE)
            enableInputSplitDebugMode(list);

        fullList.addAll(list);
    }
    return fullList;
}

From source file: com.ikanow.infinit.e.data_model.custom.InfiniteFileInputFormat.java

License: Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {

    // Test mode needs to restrict the docs also, otherwise things can get out of hand...
    int debugLimit = job.getConfiguration().getInt("mongo.input.limit", Integer.MAX_VALUE);
    if (debugLimit <= 0) { // (just not set)
        debugLimit = Integer.MAX_VALUE;
    }

    List<InputSplit> splits = super.getSplits(job);

    if (splits.size() > debugLimit) {
        splits = splits.subList(0, debugLimit);
    }
    return splits;
}
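
The cap is read straight off the job configuration, so a test harness only has to set mongo.input.limit before submission. A minimal driver-side sketch; the class name and the value 5 are arbitrary choices for illustration:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class InfiniteFileTestJobBuilder {
    // Sketch only: caps the number of splits produced by the format above
    public static Job testModeJob() throws IOException {
        final Configuration conf = new Configuration();
        conf.setInt("mongo.input.limit", 5); // <= 0 or unset is treated as "no limit"
        final Job job = Job.getInstance(conf);
        job.setInputFormatClass(InfiniteFileInputFormat.class);
        return job;
    }
}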

From source file: com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat.java

License: Apache License

@Override
public List<InputSplit> getSplits(JobContext context) {
    final Configuration hadoopConfiguration = context.getConfiguration();
    final InfiniteMongoConfig conf = new InfiniteMongoConfig(hadoopConfiguration);
    List<InputSplit> splits = InfiniteMongoSplitter.calculateSplits(conf);

    if (conf.getSelfMerge() != null) {
        //check if we need to grab existing records and add them to the splits
        final Configuration existingConfiguration = context.getConfiguration();
        existingConfiguration.set("mongo.input.uri", conf.getSelfMerge());
        BasicDBObject query = new BasicDBObject();
        //add this query to only get items from before now if no reducer is specified (otherwise
        //we would leak items mapped on the first run back in before this split runs)
        if (context.getNumReduceTasks() == 0)
            query.put("_id", new BasicDBObject(MongoDbManager.lt_, new ObjectId()));
        existingConfiguration.set("mongo.input.query", query.toString());
        final InfiniteMongoConfig existingConf = new InfiniteMongoConfig(existingConfiguration);
        splits.addAll(InfiniteMongoSplitter.calculateSplits(existingConf));
    }
    return splits;
}

From source file: com.ikanow.infinit.e.data_model.custom.InfiniteShareInputFormat.java

License: Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {

    // Test mode needs to restrict the docs also, otherwise things can get out of hand...
    int debugLimit = job.getConfiguration().getInt("mongo.input.limit", Integer.MAX_VALUE);
    if (debugLimit <= 0) { // (just not set)
        debugLimit = Integer.MAX_VALUE;
    }

    // Get the already authenticated list of share ids
    String oidStrs[] = job.getConfiguration().getStrings("mapred.input.dir");
    List<InputSplit> splits = new LinkedList<InputSplit>();
    int numSplits = 0;

    String sourceStr = job.getConfiguration().get("mongo.input.query");
    SourcePojo source = ApiManager.mapFromApi(sourceStr, SourcePojo.class, null);
    SourceFileConfigPojo fileConfig = source.getFileConfig();
    Pattern pathInclude = null;
    Pattern pathExclude = null;
    if (null != fileConfig.pathInclude) {
        pathInclude = Pattern.compile(fileConfig.pathInclude);
    }
    if (null != fileConfig.pathExclude) {
        pathExclude = Pattern.compile(fileConfig.pathExclude);
    }

    for (String oidStr : oidStrs) {
        try {
            BasicDBObject query = new BasicDBObject(SharePojo._id_, new ObjectId(oidStr));
            BasicDBObject fields = new BasicDBObject(SharePojo.binaryId_, 1);
            fields.put(SharePojo.title_, 1);
            SharePojo share = SharePojo.fromDb(DbManager.getSocial().getShare().findOne(query, fields),
                    SharePojo.class);

            if ((null != share) && (null != share.getBinaryId())) {
                GridFSRandomAccessFile file = new GridFSRandomAccessFile(
                        MongoDbManager.getSocial().getShareBinary(), share.getBinaryId());
                GridFSZipFile zipView = new GridFSZipFile(share.getTitle(), file);
                @SuppressWarnings("unchecked")
                Enumeration<net.sf.jazzlib.ZipEntry> entries = zipView.entries();
                while (entries.hasMoreElements()) {
                    net.sf.jazzlib.ZipEntry zipInfo = entries.nextElement();

                    if (zipInfo.isDirectory()) {
                        continue;
                    }

                    if (null != pathInclude) {
                        if (!pathInclude.matcher(zipInfo.getName()).matches()) {
                            continue;
                        }
                    }
                    if (null != pathExclude) {
                        if (pathExclude.matcher(zipInfo.getName()).matches()) {
                            continue;
                        }
                    }

                    InfiniteShareInputSplit split = new InfiniteShareInputSplit(share.get_id(),
                            share.getBinaryId(), zipInfo.getName(), zipInfo.getSize(), zipInfo.getTime());
                    splits.add(split);
                    if (++numSplits >= debugLimit) {
                        break;
                    }
                    //DEBUG
                    //System.out.println("ADD NEW SPLIT: " + share.get_id() + " , " + share.getBinaryId() + " , " + zipInfo.getName() + " , " + zipInfo.getSize());
                }
            }
        } catch (Exception e) {
        } // (this would be an internal logic error)
    }
    return splits;
}

From source file: com.ikanow.infinit.e.processing.custom.utils.InfiniteElasticsearchMongoInputFormat.java

License: Apache License

@Override
public List<InputSplit> getSplits(JobContext context) {
    final Configuration hadoopConfiguration = context.getConfiguration();
    final InfiniteMongoConfig conf = new InfiniteMongoConfig(hadoopConfiguration);
    String queryStr = hadoopConfiguration.get("mongo.input.query");
    String userIdStr = hadoopConfiguration.get("infinit.e.userid");
    AdvancedQueryPojo query = AdvancedQueryPojo.fromApi(queryStr, AdvancedQueryPojo.class);
    return calculateSplits(query, userIdStr, conf);
}

From source file: com.inmobi.conduit.distcp.tools.mapred.lib.TestDynamicInputFormat.java

License: Apache License

@Test
public void testGetSplits() throws Exception {
    DistCpOptions options = getOptions();
    Configuration configuration = new Configuration();
    configuration.set("mapred.map.tasks", String.valueOf(options.getMaxMaps()));
    CopyListing.getCopyListing(configuration, CREDENTIALS, options).buildListing(
            new Path(cluster.getFileSystem().getUri().toString() + "/tmp/testDynInputFormat/fileList.seq"),
            options);

    JobID jobId = new JobID();
    JobContext jobContext = mock(JobContext.class);
    when(jobContext.getConfiguration()).thenReturn(configuration);
    when(jobContext.getJobID()).thenReturn(jobId);
    DynamicInputFormat<Text, FileStatus> inputFormat = new DynamicInputFormat<Text, FileStatus>();
    List<InputSplit> splits = inputFormat.getSplits(jobContext);

    int nFiles = 0;
    int taskId = 0;

    for (InputSplit split : splits) {
        TaskAttemptID tId = new TaskAttemptID("", 0, true, taskId, 0);
        final TaskAttemptContext taskAttemptContext = mock(TaskAttemptContext.class);
        when(taskAttemptContext.getConfiguration()).thenReturn(configuration);
        when(taskAttemptContext.getTaskAttemptID()).thenReturn(tId);
        RecordReader<Text, FileStatus> recordReader = inputFormat.createRecordReader(split, taskAttemptContext);
        recordReader.initialize(splits.get(0), taskAttemptContext);
        float previousProgressValue = 0f;
        while (recordReader.nextKeyValue()) {
            FileStatus fileStatus = recordReader.getCurrentValue();
            String source = fileStatus.getPath().toString();
            System.out.println(source);
            Assert.assertTrue(expectedFilePaths.contains(source));
            final float progress = recordReader.getProgress();
            Assert.assertTrue(progress >= previousProgressValue);
            Assert.assertTrue(progress >= 0.0f);
            Assert.assertTrue(progress <= 1.0f);
            previousProgressValue = progress;
            ++nFiles;
        }
        Assert.assertTrue(recordReader.getProgress() == 1.0f);

        ++taskId;
    }

    Assert.assertEquals(expectedFilePaths.size(), nFiles);
}