List of usage examples for org.apache.hadoop.mapreduce JobContext getConfiguration
public Configuration getConfiguration();
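All of the examples below follow the same basic pattern: from inside an InputFormat (or other job-side code), call JobContext.getConfiguration() to read settings that were placed in the job Configuration on the driver side. As a minimal sketch of that pattern, the class name MySketchInputFormat and the property name "my.example.limit" below are hypothetical placeholders, not part of any of the projects listed on this page.

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

// Minimal sketch: read a job property via JobContext.getConfiguration() inside getSplits().
// "my.example.limit" is a made-up property name used only for illustration.
public class MySketchInputFormat extends InputFormat<LongWritable, Text> {

    @Override
    public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
        // getConfiguration() returns the job's Configuration as set on the driver side
        int limit = context.getConfiguration().getInt("my.example.limit", Integer.MAX_VALUE);

        // A real implementation would compute the splits here; this sketch only shows
        // how the configured limit would be applied to whatever list was built.
        List<InputSplit> splits = new ArrayList<InputSplit>();
        if (splits.size() > limit) {
            splits = splits.subList(0, limit);
        }
        return splits;
    }

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) {
        throw new UnsupportedOperationException("sketch only");
    }
}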
From source file:com.ikanow.aleph2.analytics.r.assets.BeFileInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    logger.debug("BeFileInputFormat.getSplits");
    super.setMaxSplitSize(MAX_SPLIT_SIZE);

    try {
        final List<InputSplit> splits = Lambdas.get(Lambdas.wrap_u(() -> {
            final List<InputSplit> tmp = super.getSplits(context);

            String debug_max_str = context.getConfiguration().get(BatchEnrichmentJob.BE_DEBUG_MAX_SIZE);
            if (null != debug_max_str) {
                final int requested_records = Integer.parseInt(debug_max_str);
                // dump 5x the requested number of splits into one mega split,
                // to strike a balance between limiting the data and making sure
                // that tests generate enough records
                final CombineFileSplit combined = new CombineFileSplit(
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(split -> Arrays.stream(split.getPaths()))
                                .limit(5L * requested_records)
                                .<Path>toArray(size -> new Path[size]),
                        ArrayUtils.toPrimitive(
                                tmp.stream().map(split -> (CombineFileSplit) split)
                                        .flatMap(split -> Arrays.stream(split.getStartOffsets()).boxed())
                                        .limit(5L * requested_records)
                                        .<Long>toArray(size -> new Long[size]),
                                0L),
                        ArrayUtils.toPrimitive(
                                tmp.stream().map(split -> (CombineFileSplit) split)
                                        .flatMap(split -> Arrays.stream(split.getLengths()).boxed())
                                        .limit(5L * requested_records)
                                        .<Long>toArray(size -> new Long[size]),
                                0L),
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(Lambdas.wrap_u(split -> Arrays.stream(split.getLocations())))
                                .limit(5L * requested_records)
                                .<String>toArray(size -> new String[size]));

                return Arrays.<InputSplit>asList(combined);
            } else
                return tmp;
        }));

        logger.debug("BeFileInputFormat.getSplits: " + ((splits != null) ? splits.size() : "null"));
        return splits;
    } catch (Throwable t) {
        logger.error(ErrorUtils.getLongForm("Error getting splits, error = {0}", t));
        return Collections.emptyList();
    }
}
From source file:com.ikanow.aleph2.analytics.spark.assets.BeFileInputFormat_Pure.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    logger.debug("BeFileInputFormat.getSplits");
    super.setMaxSplitSize(MAX_SPLIT_SIZE);

    try {
        final List<InputSplit> splits = Lambdas.get(Lambdas.wrap_u(() -> {
            final List<InputSplit> tmp = super.getSplits(context);

            String debug_max_str = context.getConfiguration().get(HadoopBatchEnrichmentUtils.BE_DEBUG_MAX_SIZE);
            if (null != debug_max_str) {
                final int requested_records = Integer.parseInt(debug_max_str);
                // dump 5x the requested number of splits into one mega split,
                // to strike a balance between limiting the data and making sure
                // that tests generate enough records
                final CombineFileSplit combined = new CombineFileSplit(
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(split -> Arrays.stream(split.getPaths()))
                                .limit(5L * requested_records)
                                .<Path>toArray(size -> new Path[size]),
                        ArrayUtils.toPrimitive(
                                tmp.stream().map(split -> (CombineFileSplit) split)
                                        .flatMap(split -> Arrays.stream(split.getStartOffsets()).boxed())
                                        .limit(5L * requested_records)
                                        .<Long>toArray(size -> new Long[size]),
                                0L),
                        ArrayUtils.toPrimitive(
                                tmp.stream().map(split -> (CombineFileSplit) split)
                                        .flatMap(split -> Arrays.stream(split.getLengths()).boxed())
                                        .limit(5L * requested_records)
                                        .<Long>toArray(size -> new Long[size]),
                                0L),
                        tmp.stream().map(split -> (CombineFileSplit) split)
                                .flatMap(Lambdas.wrap_u(split -> Arrays.stream(split.getLocations())))
                                .limit(5L * requested_records)
                                .<String>toArray(size -> new String[size]));

                return Arrays.<InputSplit>asList(combined);
            } else
                return tmp;
        }));

        logger.debug("BeFileInputFormat.getSplits: " + ((splits != null) ? splits.size() : "null"));
        return splits;
    } catch (Throwable t) {
        logger.error(ErrorUtils.getLongForm("Error getting splits, error = {0}", t));
        return Collections.emptyList();
    }
}
From source file:com.ikanow.aleph2.search_service.elasticsearch.hadoop.assets.Aleph2EsInputFormat.java
License:Apache License
public static List<InputSplit> getSplits(Function<JobContext, List<InputSplit>> get_splits, JobContext context) {
    final String[] indexes = context.getConfiguration().get(ALEPH2_RESOURCE, "").split(",,");

    return Arrays.stream(indexes).<InputSplit>flatMap(Lambdas.wrap_u(index -> {
        context.getConfiguration().set("es.resource.read", index.replace(" ", "%20"));
        return get_splits.apply(context).stream();
    })).collect(Collectors.<InputSplit>toList());
}
From source file:com.ikanow.aleph2.search_service.elasticsearch.hadoop.assets.TestAleph2EsInputFormat.java
License:Apache License
@Test
public void test_Aleph2EsInputFormat() {
    new Aleph2EsInputFormat(); // code coverage!

    final JobContext mock_context = Mockito.mock(JobContext.class);
    final Configuration test_config = new Configuration(false);
    test_config.set("aleph2.es.resource", "test1,,test ,2");
    Mockito.when(mock_context.getConfiguration()).thenReturn(test_config);

    final List<InputSplit> res = Aleph2EsInputFormat.getSplits(ctxt -> {
        final InputSplit split1 = Mockito.mock(InputSplit.class);
        final String es_res = ctxt.getConfiguration().get("es.resource.read").toString();
        Mockito.when(split1.toString()).thenReturn(es_res);
        return Arrays.asList(split1);
    }, mock_context);

    assertEquals(Arrays.asList("test1", "test%20,2"),
            res.stream().map(fmt -> fmt.toString()).collect(Collectors.toList()));
}
From source file:com.ikanow.infinit.e.data_model.custom.InfiniteEsInputFormat.java
License:Apache License
@Override
public List getSplits(JobContext arg0) throws IOException, InterruptedException {
    LinkedList<InputSplit> fullList = new LinkedList<InputSplit>();

    String indexes[] = arg0.getConfiguration().get("es.resource").split("\\s*,,\\s*");
    for (String index : indexes) {
        _delegate = new EsInputFormat(); // create a new input format for each object
        arg0.getConfiguration().set("es.resource.read", index.replace(" ", "%20")); // (spaces in types cause problems)

        @SuppressWarnings("unchecked")
        List<InputSplit> list = _delegate.getSplits(arg0);

        if (LOCAL_DEBUG_MODE)
            enableInputSplitDebugMode(list);

        fullList.addAll(list);
    }
    return fullList;
}
From source file:com.ikanow.infinit.e.data_model.custom.InfiniteFileInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    // Test mode needs to restrict the docs also, otherwise things can get out of hand...
    int debugLimit = job.getConfiguration().getInt("mongo.input.limit", Integer.MAX_VALUE);
    if (debugLimit <= 0) { // (just not set)
        debugLimit = Integer.MAX_VALUE;
    }
    List<InputSplit> splits = super.getSplits(job);
    if (splits.size() > debugLimit) {
        splits = splits.subList(0, debugLimit);
    }
    return splits;
}
From source file:com.ikanow.infinit.e.data_model.custom.InfiniteMongoInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext context) {
    final Configuration hadoopConfiguration = context.getConfiguration();
    final InfiniteMongoConfig conf = new InfiniteMongoConfig(hadoopConfiguration);
    List<InputSplit> splits = InfiniteMongoSplitter.calculateSplits(conf);

    if (conf.getSelfMerge() != null) {
        // check if we need to grab existing records and add them to the splits
        final Configuration existingConfiguration = context.getConfiguration();
        existingConfiguration.set("mongo.input.uri", conf.getSelfMerge());
        BasicDBObject query = new BasicDBObject();
        // add on this query to only get items previous to now if no reducer is specified (otherwise
        // we will leak any items we map on the first run back in before this split runs)
        if (context.getNumReduceTasks() == 0)
            query.put("_id", new BasicDBObject(MongoDbManager.lt_, new ObjectId()));
        existingConfiguration.set("mongo.input.query", query.toString());

        final InfiniteMongoConfig existingConf = new InfiniteMongoConfig(existingConfiguration);
        splits.addAll(InfiniteMongoSplitter.calculateSplits(existingConf));
    }
    return splits;
}
From source file:com.ikanow.infinit.e.data_model.custom.InfiniteShareInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
    // Test mode needs to restrict the docs also, otherwise things can get out of hand...
    int debugLimit = job.getConfiguration().getInt("mongo.input.limit", Integer.MAX_VALUE);
    if (debugLimit <= 0) { // (just not set)
        debugLimit = Integer.MAX_VALUE;
    }

    // Get the already authenticated list of share ids
    String oidStrs[] = job.getConfiguration().getStrings("mapred.input.dir");

    List<InputSplit> splits = new LinkedList<InputSplit>();
    int numSplits = 0;

    String sourceStr = job.getConfiguration().get("mongo.input.query");
    SourcePojo source = ApiManager.mapFromApi(sourceStr, SourcePojo.class, null);
    SourceFileConfigPojo fileConfig = source.getFileConfig();

    Pattern pathInclude = null;
    Pattern pathExclude = null;
    if (null != fileConfig.pathInclude) {
        pathInclude = Pattern.compile(fileConfig.pathInclude);
    }
    if (null != fileConfig.pathExclude) {
        pathExclude = Pattern.compile(fileConfig.pathExclude);
    }

    for (String oidStr : oidStrs) {
        try {
            BasicDBObject query = new BasicDBObject(SharePojo._id_, new ObjectId(oidStr));
            BasicDBObject fields = new BasicDBObject(SharePojo.binaryId_, 1);
            fields.put(SharePojo.title_, 1);
            SharePojo share = SharePojo.fromDb(DbManager.getSocial().getShare().findOne(query, fields),
                    SharePojo.class);

            if ((null != share) && (null != share.getBinaryId())) {
                GridFSRandomAccessFile file = new GridFSRandomAccessFile(
                        MongoDbManager.getSocial().getShareBinary(), share.getBinaryId());
                GridFSZipFile zipView = new GridFSZipFile(share.getTitle(), file);

                @SuppressWarnings("unchecked")
                Enumeration<net.sf.jazzlib.ZipEntry> entries = zipView.entries();
                while (entries.hasMoreElements()) {
                    net.sf.jazzlib.ZipEntry zipInfo = entries.nextElement();
                    if (zipInfo.isDirectory()) {
                        continue;
                    }
                    if (null != pathInclude) {
                        if (!pathInclude.matcher(zipInfo.getName()).matches()) {
                            continue;
                        }
                    }
                    if (null != pathExclude) {
                        if (pathExclude.matcher(zipInfo.getName()).matches()) {
                            continue;
                        }
                    }
                    InfiniteShareInputSplit split = new InfiniteShareInputSplit(share.get_id(),
                            share.getBinaryId(), zipInfo.getName(), zipInfo.getSize(), zipInfo.getTime());
                    splits.add(split);
                    if (++numSplits >= debugLimit) {
                        break;
                    }
                    //DEBUG
                    //System.out.println("ADD NEW SPLIT: " + share.get_id() + " , " + share.getBinaryId() + " , " + zipInfo.getName() + " , " + zipInfo.getSize());
                }
            }
        } catch (Exception e) {
        } // (this would be an internal logic error)
    }
    return splits;
}
From source file:com.ikanow.infinit.e.processing.custom.utils.InfiniteElasticsearchMongoInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(JobContext context) {
    final Configuration hadoopConfiguration = context.getConfiguration();
    final InfiniteMongoConfig conf = new InfiniteMongoConfig(hadoopConfiguration);

    String queryStr = hadoopConfiguration.get("mongo.input.query");
    String userIdStr = hadoopConfiguration.get("infinit.e.userid");
    AdvancedQueryPojo query = AdvancedQueryPojo.fromApi(queryStr, AdvancedQueryPojo.class);

    return calculateSplits(query, userIdStr, conf);
}
From source file:com.inmobi.conduit.distcp.tools.mapred.lib.TestDynamicInputFormat.java
License:Apache License
@Test
public void testGetSplits() throws Exception {
    DistCpOptions options = getOptions();
    Configuration configuration = new Configuration();
    configuration.set("mapred.map.tasks", String.valueOf(options.getMaxMaps()));
    CopyListing.getCopyListing(configuration, CREDENTIALS, options).buildListing(
            new Path(cluster.getFileSystem().getUri().toString() + "/tmp/testDynInputFormat/fileList.seq"),
            options);

    JobID jobId = new JobID();
    JobContext jobContext = mock(JobContext.class);
    when(jobContext.getConfiguration()).thenReturn(configuration);
    when(jobContext.getJobID()).thenReturn(jobId);

    DynamicInputFormat<Text, FileStatus> inputFormat = new DynamicInputFormat<Text, FileStatus>();
    List<InputSplit> splits = inputFormat.getSplits(jobContext);

    int nFiles = 0;
    int taskId = 0;

    for (InputSplit split : splits) {
        TaskAttemptID tId = new TaskAttemptID("", 0, true, taskId, 0);
        final TaskAttemptContext taskAttemptContext = mock(TaskAttemptContext.class);
        when(taskAttemptContext.getConfiguration()).thenReturn(configuration);
        when(taskAttemptContext.getTaskAttemptID()).thenReturn(tId);

        RecordReader<Text, FileStatus> recordReader = inputFormat.createRecordReader(split, taskAttemptContext);
        recordReader.initialize(splits.get(0), taskAttemptContext);

        float previousProgressValue = 0f;
        while (recordReader.nextKeyValue()) {
            FileStatus fileStatus = recordReader.getCurrentValue();
            String source = fileStatus.getPath().toString();
            System.out.println(source);
            Assert.assertTrue(expectedFilePaths.contains(source));

            final float progress = recordReader.getProgress();
            Assert.assertTrue(progress >= previousProgressValue);
            Assert.assertTrue(progress >= 0.0f);
            Assert.assertTrue(progress <= 1.0f);
            previousProgressValue = progress;
            ++nFiles;
        }
        Assert.assertTrue(recordReader.getProgress() == 1.0f);
        ++taskId;
    }

    Assert.assertEquals(expectedFilePaths.size(), nFiles);
}