Example usage for org.apache.hadoop.mapred JobConf setBoolean

Introduction

In this page you can find the example usage for org.apache.hadoop.mapred JobConf setBoolean.

Prototype

public void setBoolean(String name, boolean value)

Source Link

Document

Set the value of the name property to a boolean.

Usage

From source file:org.apache.pig.backend.hadoop.executionengine.tez.TezDagBuilder.java

License:Apache License

private Vertex newVertex(TezOperator tezOp, boolean isMap)
        throws IOException, ClassNotFoundException, InterruptedException {
    ProcessorDescriptor procDesc = ProcessorDescriptor.create(tezOp.getProcessorName());

    // Pass physical plans to vertex as user payload.
    JobConf payloadConf = new JobConf(ConfigurationUtil.toConfiguration(pc.getProperties(), false));

    // We do this so that dag.getCredentials(), job.getCredentials(),
    // job.getConfiguration().getCredentials() all reference the same Credentials object
    // Unfortunately there is no setCredentials() on Job
    payloadConf.setCredentials(dag.getCredentials());
    // We won't actually use this job, but we need it to talk with the Load Store funcs
    @SuppressWarnings("deprecation")
    Job job = new Job(payloadConf);
    payloadConf = (JobConf) job.getConfiguration();

    if (tezOp.sampleOperator != null) {
        payloadConf.set(PigProcessor.SAMPLE_VERTEX, tezOp.sampleOperator.getOperatorKey().toString());
    }/*w ww  . j  a v a  2s  . c  om*/

    if (tezOp.sortOperator != null) {
        payloadConf.set(PigProcessor.SORT_VERTEX, tezOp.sortOperator.getOperatorKey().toString());
    }

    String tmp;
    long maxCombinedSplitSize = 0;
    if (!tezOp.combineSmallSplits()
            || pc.getProperties().getProperty(PigConfiguration.PIG_SPLIT_COMBINATION, "true").equals("false"))
        payloadConf.setBoolean(PigConfiguration.PIG_NO_SPLIT_COMBINATION, true);
    else if ((tmp = pc.getProperties().getProperty(PigConfiguration.PIG_MAX_COMBINED_SPLIT_SIZE,
            null)) != null) {
        try {
            maxCombinedSplitSize = Long.parseLong(tmp);
        } catch (NumberFormatException e) {
            log.warn(
                    "Invalid numeric format for pig.maxCombinedSplitSize; use the default maximum combined split size");
        }
    }
    if (maxCombinedSplitSize > 0)
        payloadConf.setLong("pig.maxCombinedSplitSize", maxCombinedSplitSize);

    payloadConf.set("pig.inputs", ObjectSerializer.serialize(tezOp.getLoaderInfo().getInp()));
    payloadConf.set("pig.inpSignatures",
            ObjectSerializer.serialize(tezOp.getLoaderInfo().getInpSignatureLists()));
    payloadConf.set("pig.inpLimits", ObjectSerializer.serialize(tezOp.getLoaderInfo().getInpLimits()));
    // Process stores
    LinkedList<POStore> stores = processStores(tezOp, payloadConf, job);

    payloadConf.set("pig.pigContext", ObjectSerializer.serialize(pc));
    payloadConf.set("udf.import.list", ObjectSerializer.serialize(PigContext.getPackageImportList()));
    payloadConf.set("exectype", "TEZ");
    payloadConf.setBoolean(MRConfiguration.MAPPER_NEW_API, true);
    payloadConf.setClass(MRConfiguration.INPUTFORMAT_CLASS, PigInputFormat.class, InputFormat.class);

    // Set parent plan for all operators in the Tez plan.
    new PhyPlanSetter(tezOp.plan).visit();

    // Set the endOfAllInput flag on the physical plan if certain operators that
    // use this property (such as STREAM) are present in the plan.
    EndOfAllInputSetter.EndOfAllInputChecker checker = new EndOfAllInputSetter.EndOfAllInputChecker(tezOp.plan);
    checker.visit();
    if (checker.isEndOfAllInputPresent()) {
        payloadConf.set(JobControlCompiler.END_OF_INP_IN_MAP, "true");
    }

    // Configure the classes for incoming shuffles to this TezOp
    // TODO: Refactor out resetting input keys, PIG-3957
    List<PhysicalOperator> roots = tezOp.plan.getRoots();
    if (roots.size() == 1 && roots.get(0) instanceof POPackage) {
        POPackage pack = (POPackage) roots.get(0);

        List<PhysicalOperator> succsList = tezOp.plan.getSuccessors(pack);
        if (succsList != null) {
            succsList = new ArrayList<PhysicalOperator>(succsList);
        }
        byte keyType = pack.getPkgr().getKeyType();
        tezOp.plan.remove(pack);
        payloadConf.set("pig.reduce.package", ObjectSerializer.serialize(pack));
        setIntermediateOutputKeyValue(keyType, payloadConf, tezOp);
        POShuffleTezLoad newPack;
        newPack = new POShuffleTezLoad(pack);
        if (tezOp.isSkewedJoin()) {
            newPack.setSkewedJoins(true);
        }
        tezOp.plan.add(newPack);

        // Set input keys for POShuffleTezLoad. This is used to identify
        // the inputs that are attached to the POShuffleTezLoad in the
        // backend.
        Map<Integer, String> localRearrangeMap = new TreeMap<Integer, String>();
        for (TezOperator pred : mPlan.getPredecessors(tezOp)) {
            if (tezOp.sampleOperator != null && tezOp.sampleOperator == pred) {
                // skip sample vertex input
            } else {
                String inputKey = pred.getOperatorKey().toString();
                if (pred.isVertexGroup()) {
                    pred = mPlan.getOperator(pred.getVertexGroupMembers().get(0));
                }
                LinkedList<POLocalRearrangeTez> lrs = PlanHelper.getPhysicalOperators(pred.plan,
                        POLocalRearrangeTez.class);
                for (POLocalRearrangeTez lr : lrs) {
                    if (lr.isConnectedToPackage()
                            && lr.getOutputKey().equals(tezOp.getOperatorKey().toString())) {
                        localRearrangeMap.put((int) lr.getIndex(), inputKey);
                    }
                }
            }
        }
        for (Map.Entry<Integer, String> entry : localRearrangeMap.entrySet()) {
            newPack.addInputKey(entry.getValue());
        }

        if (succsList != null) {
            for (PhysicalOperator succs : succsList) {
                tezOp.plan.connect(newPack, succs);
            }
        }

        setIntermediateOutputKeyValue(pack.getPkgr().getKeyType(), payloadConf, tezOp);
    } else if (roots.size() == 1 && roots.get(0) instanceof POIdentityInOutTez) {
        POIdentityInOutTez identityInOut = (POIdentityInOutTez) roots.get(0);
        // TODO Need to fix multiple input key mapping
        TezOperator identityInOutPred = null;
        for (TezOperator pred : mPlan.getPredecessors(tezOp)) {
            if (!pred.isSampleAggregation()) {
                identityInOutPred = pred;
                break;
            }
        }
        identityInOut.setInputKey(identityInOutPred.getOperatorKey().toString());
    } else if (roots.size() == 1 && roots.get(0) instanceof POValueInputTez) {
        POValueInputTez valueInput = (POValueInputTez) roots.get(0);

        LinkedList<String> scalarInputs = new LinkedList<String>();
        for (POUserFunc userFunc : PlanHelper.getPhysicalOperators(tezOp.plan, POUserFunc.class)) {
            if (userFunc.getFunc() instanceof ReadScalarsTez) {
                scalarInputs.add(((ReadScalarsTez) userFunc.getFunc()).getTezInputs()[0]);
            }
        }
        // Make sure we don't find the scalar
        for (TezOperator pred : mPlan.getPredecessors(tezOp)) {
            if (!scalarInputs.contains(pred.getOperatorKey().toString())) {
                valueInput.setInputKey(pred.getOperatorKey().toString());
                break;
            }
        }
    }
    JobControlCompiler.setOutputFormat(job);

    // set parent plan in all operators. currently the parent plan is really
    // used only when POStream, POSplit are present in the plan
    new PhyPlanSetter(tezOp.plan).visit();

    // Serialize the execution plan
    payloadConf.set(PigProcessor.PLAN, ObjectSerializer.serialize(tezOp.plan));

    UDFContext.getUDFContext().serialize(payloadConf);

    MRToTezHelper.processMRSettings(payloadConf, globalConf);

    if (!pc.inIllustrator) {
        for (POStore store : stores) {
            // unset inputs for POStore, otherwise, map/reduce plan will be unnecessarily deserialized
            store.setInputs(null);
            store.setParentPlan(null);
        }
        // We put them in the reduce because PigOutputCommitter checks the
        // ID of the task to see if it's a map, and if not, calls the reduce
        // committers.
        payloadConf.set(JobControlCompiler.PIG_MAP_STORES,
                ObjectSerializer.serialize(new ArrayList<POStore>()));
        payloadConf.set(JobControlCompiler.PIG_REDUCE_STORES, ObjectSerializer.serialize(stores));
    }

    if (tezOp.isNeedEstimateParallelism()) {
        payloadConf.setBoolean(PigProcessor.ESTIMATE_PARALLELISM, true);
        log.info("Estimate quantile for sample aggregation vertex " + tezOp.getOperatorKey().toString());
    }

    // Take our assembled configuration and create a vertex
    UserPayload userPayload = TezUtils.createUserPayloadFromConf(payloadConf);
    procDesc.setUserPayload(userPayload);

    Vertex vertex = Vertex.create(tezOp.getOperatorKey().toString(), procDesc, tezOp.getVertexParallelism(),
            isMap ? MRHelpers.getResourceForMRMapper(globalConf)
                    : MRHelpers.getResourceForMRReducer(globalConf));

    Map<String, String> taskEnv = new HashMap<String, String>();
    MRHelpers.updateEnvBasedOnMRTaskEnv(globalConf, taskEnv, isMap);
    vertex.setTaskEnvironment(taskEnv);

    // All these classes are @InterfaceAudience.Private in Hadoop. Switch to Tez methods in TEZ-1012
    // set the timestamps, public/private visibility of the archives and files
    ClientDistributedCacheManager.determineTimestampsAndCacheVisibilities(globalConf);
    // get DelegationToken for each cached file
    ClientDistributedCacheManager.getDelegationTokens(globalConf, job.getCredentials());
    MRApps.setupDistributedCache(globalConf, localResources);
    vertex.addTaskLocalFiles(localResources);

    vertex.setTaskLaunchCmdOpts(isMap ? MRHelpers.getJavaOptsForMRMapper(globalConf)
            : MRHelpers.getJavaOptsForMRReducer(globalConf));

    log.info("For vertex - " + tezOp.getOperatorKey().toString() + ": parallelism="
            + tezOp.getVertexParallelism() + ", memory=" + vertex.getTaskResource().getMemory() + ", java opts="
            + vertex.getTaskLaunchCmdOpts());

    // Right now there can only be one of each of these. Will need to be
    // more generic when there can be more.
    for (POLoad ld : tezOp.getLoaderInfo().getLoads()) {

        // TODO: These should get the globalConf, or a merged version that
        // keeps settings like pig.maxCombinedSplitSize
        vertex.setLocationHint(
                VertexLocationHint.create(tezOp.getLoaderInfo().getInputSplitInfo().getTaskLocationHints()));
        vertex.addDataSource(ld.getOperatorKey().toString(), DataSourceDescriptor.create(
                InputDescriptor.create(MRInput.class.getName())
                        .setUserPayload(UserPayload.create(MRRuntimeProtos.MRInputUserPayloadProto.newBuilder()
                                .setConfigurationBytes(TezUtils.createByteStringFromConf(payloadConf))
                                .setSplits(tezOp.getLoaderInfo().getInputSplitInfo().getSplitsProto()).build()
                                .toByteString().asReadOnlyByteBuffer())),
                InputInitializerDescriptor.create(MRInputSplitDistributor.class.getName()),
                dag.getCredentials()));
    }

    for (POStore store : stores) {

        ArrayList<POStore> emptyList = new ArrayList<POStore>();
        ArrayList<POStore> singleStore = new ArrayList<POStore>();
        singleStore.add(store);

        Configuration outputPayLoad = new Configuration(payloadConf);
        outputPayLoad.set(JobControlCompiler.PIG_MAP_STORES, ObjectSerializer.serialize(emptyList));
        outputPayLoad.set(JobControlCompiler.PIG_REDUCE_STORES, ObjectSerializer.serialize(singleStore));

        OutputDescriptor storeOutDescriptor = OutputDescriptor.create(MROutput.class.getName())
                .setUserPayload(TezUtils.createUserPayloadFromConf(outputPayLoad));
        if (tezOp.getVertexGroupStores() != null) {
            OperatorKey vertexGroupKey = tezOp.getVertexGroupStores().get(store.getOperatorKey());
            if (vertexGroupKey != null) {
                getPlan().getOperator(vertexGroupKey).getVertexGroupInfo()
                        .setStoreOutputDescriptor(storeOutDescriptor);
                continue;
            }
        }
        vertex.addDataSink(store.getOperatorKey().toString(), new DataSinkDescriptor(storeOutDescriptor,
                OutputCommitterDescriptor.create(MROutputCommitter.class.getName()), dag.getCredentials()));
    }

    // LoadFunc and StoreFunc add delegation tokens to Job Credentials in
    // setLocation and setStoreLocation respectively. For eg: HBaseStorage
    // InputFormat add delegation token in getSplits and OutputFormat in
    // checkOutputSpecs. For eg: FileInputFormat and FileOutputFormat
    if (stores.size() > 0) {
        new PigOutputFormat().checkOutputSpecs(job);
    }

    // Set the right VertexManagerPlugin
    if (tezOp.getEstimatedParallelism() != -1) {
        if (tezOp.isGlobalSort() || tezOp.isSkewedJoin()) {
            // Set VertexManagerPlugin to PartitionerDefinedVertexManager, which is able
            // to decrease/increase parallelism of sorting vertex dynamically
            // based on the numQuantiles calculated by sample aggregation vertex
            vertex.setVertexManagerPlugin(
                    VertexManagerPluginDescriptor.create(PartitionerDefinedVertexManager.class.getName()));
            log.info("Set VertexManagerPlugin to PartitionerDefinedParallelismVertexManager for vertex "
                    + tezOp.getOperatorKey().toString());
        } else {
            boolean containScatterGather = false;
            boolean containCustomPartitioner = false;
            for (TezEdgeDescriptor edge : tezOp.inEdges.values()) {
                if (edge.dataMovementType == DataMovementType.SCATTER_GATHER) {
                    containScatterGather = true;
                }
                if (edge.partitionerClass != null) {
                    containCustomPartitioner = true;
                }
            }
            if (containScatterGather && !containCustomPartitioner) {
                // Use auto-parallelism feature of ShuffleVertexManager to dynamically
                // reduce the parallelism of the vertex
                VertexManagerPluginDescriptor vmPluginDescriptor = VertexManagerPluginDescriptor
                        .create(ShuffleVertexManager.class.getName());
                Configuration vmPluginConf = ConfigurationUtil.toConfiguration(pc.getProperties(), false);
                vmPluginConf.setBoolean(ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_ENABLE_AUTO_PARALLEL,
                        true);
                if (vmPluginConf.getLong(InputSizeReducerEstimator.BYTES_PER_REDUCER_PARAM,
                        InputSizeReducerEstimator.DEFAULT_BYTES_PER_REDUCER) != InputSizeReducerEstimator.DEFAULT_BYTES_PER_REDUCER) {
                    vmPluginConf.setLong(
                            ShuffleVertexManager.TEZ_SHUFFLE_VERTEX_MANAGER_DESIRED_TASK_INPUT_SIZE,
                            vmPluginConf.getLong(InputSizeReducerEstimator.BYTES_PER_REDUCER_PARAM,
                                    InputSizeReducerEstimator.DEFAULT_BYTES_PER_REDUCER));
                }
                vmPluginDescriptor.setUserPayload(TezUtils.createUserPayloadFromConf(vmPluginConf));
                vertex.setVertexManagerPlugin(vmPluginDescriptor);
                log.info("Set auto parallelism for vertex " + tezOp.getOperatorKey().toString());
            }
        }
    }

    // Reset udfcontext jobconf. It is not supposed to be set in the front end
    UDFContext.getUDFContext().addJobConf(null);
    return vertex;
}

From source file:org.apache.solr.hadoop.MorphlineBasicMiniMRTest.java

License:Apache License

@Test
public void mrRun() throws Exception {
    FileSystem fs = dfsCluster.getFileSystem();
    Path inDir = fs.makeQualified(new Path("/user/testing/testMapperReducer/input"));
    fs.delete(inDir, true);/*from  w  ww  .j  a  va2s . c om*/
    String DATADIR = "/user/testing/testMapperReducer/data";
    Path dataDir = fs.makeQualified(new Path(DATADIR));
    fs.delete(dataDir, true);
    Path outDir = fs.makeQualified(new Path("/user/testing/testMapperReducer/output"));
    fs.delete(outDir, true);

    assertTrue(fs.mkdirs(inDir));
    Path INPATH = new Path(inDir, "input.txt");
    OutputStream os = fs.create(INPATH);
    Writer wr = new OutputStreamWriter(os, "UTF-8");
    wr.write(DATADIR + "/" + inputAvroFile);
    wr.close();

    assertTrue(fs.mkdirs(dataDir));
    fs.copyFromLocalFile(new Path(DOCUMENTS_DIR, inputAvroFile), dataDir);

    JobConf jobConf = getJobConf();
    if (ENABLE_LOCAL_JOB_RUNNER) { // enable Hadoop LocalJobRunner; this enables to run in debugger and set breakpoints
        jobConf.set("mapred.job.tracker", "local");
    }
    jobConf.setMaxMapAttempts(1);
    jobConf.setMaxReduceAttempts(1);
    jobConf.setJar(SEARCH_ARCHIVES_JAR);
    jobConf.setBoolean(ExtractingParams.IGNORE_TIKA_EXCEPTION, false);

    int shards = 2;
    int maxReducers = Integer.MAX_VALUE;
    if (ENABLE_LOCAL_JOB_RUNNER) {
        // local job runner has a couple of limitations: only one reducer is supported and the DistributedCache doesn't work.
        // see http://blog.cloudera.com/blog/2009/07/advice-on-qa-testing-your-mapreduce-jobs/
        maxReducers = 1;
        shards = 1;
    }

    String[] args = new String[] {
            "--morphline-file=" + RESOURCES_DIR + "/test-morphlines/solrCellDocumentTypes.conf",
            "--morphline-id=morphline1", "--solr-home-dir=" + MINIMR_CONF_DIR.getAbsolutePath(),
            "--output-dir=" + outDir.toString(), "--shards=" + shards, "--verbose",
            numRuns % 2 == 0 ? "--input-list=" + INPATH.toString() : dataDir.toString(),
            numRuns % 3 == 0 ? "--reducers=" + shards
                    : (numRuns % 3 == 1 ? "--reducers=-1" : "--reducers=" + Math.min(8, maxReducers)) };
    if (numRuns % 3 == 2) {
        args = concat(args, new String[] { "--fanout=2" });
    }
    if (numRuns == 0) {
        // force (slow) MapReduce based randomization to get coverage for that as well
        args = concat(new String[] { "-D", MapReduceIndexerTool.MAIN_MEMORY_RANDOMIZATION_THRESHOLD + "=-1" },
                args);
    }
    MapReduceIndexerTool tool = createTool();
    int res = ToolRunner.run(jobConf, tool, args);
    assertEquals(0, res);
    Job job = tool.job;
    assertTrue(job.isComplete());
    assertTrue(job.isSuccessful());

    if (numRuns % 3 != 2) {
        // Only run this check if mtree merge is disabled.
        // With mtree merge enabled the BatchWriter counters aren't available anymore because 
        // variable "job" now refers to the merge job rather than the indexing job
        assertEquals(
                "Invalid counter " + SolrRecordWriter.class.getName() + "." + SolrCounters.DOCUMENTS_WRITTEN,
                count,
                job.getCounters()
                        .findCounter(SolrCounters.class.getName(), SolrCounters.DOCUMENTS_WRITTEN.toString())
                        .getValue());
    }

    // Check the output is as expected
    outDir = new Path(outDir, MapReduceIndexerTool.RESULTS_DIR);
    Path[] outputFiles = FileUtil.stat2Paths(fs.listStatus(outDir));

    System.out.println("outputfiles:" + Arrays.toString(outputFiles));

    TestUtils.validateSolrServerDocumentCount(MINIMR_CONF_DIR, fs, outDir, count, shards);

    // run again with --dryrun mode:  
    tool = createTool();
    args = concat(args, new String[] { "--dry-run" });
    res = ToolRunner.run(jobConf, tool, args);
    assertEquals(0, res);

    numRuns++;
}

From source file:org.apache.solr.hadoop.MorphlineGoLiveMiniMRTest.java

License:Apache License

@Override
public void doTest() throws Exception {

    waitForRecoveriesToFinish(false);//w  w w. j ava2 s . co  m

    FileSystem fs = dfsCluster.getFileSystem();
    Path inDir = fs.makeQualified(new Path("/user/testing/testMapperReducer/input"));
    fs.delete(inDir, true);
    String DATADIR = "/user/testing/testMapperReducer/data";
    Path dataDir = fs.makeQualified(new Path(DATADIR));
    fs.delete(dataDir, true);
    Path outDir = fs.makeQualified(new Path("/user/testing/testMapperReducer/output"));
    fs.delete(outDir, true);

    assertTrue(fs.mkdirs(inDir));
    Path INPATH = upAvroFile(fs, inDir, DATADIR, dataDir, inputAvroFile1);

    JobConf jobConf = getJobConf();
    // enable mapred.job.tracker = local to run in debugger and set breakpoints
    // jobConf.set("mapred.job.tracker", "local");
    jobConf.setMaxMapAttempts(1);
    jobConf.setMaxReduceAttempts(1);
    jobConf.setJar(SEARCH_ARCHIVES_JAR);
    jobConf.setBoolean(ExtractingParams.IGNORE_TIKA_EXCEPTION, false);

    MapReduceIndexerTool tool;
    int res;
    QueryResponse results;
    HttpSolrServer server = new HttpSolrServer(cloudJettys.get(0).url);

    String[] args = new String[] { "--solr-home-dir=" + MINIMR_CONF_DIR.getAbsolutePath(),
            "--output-dir=" + outDir.toString(), "--mappers=3",
            ++numRuns % 2 == 0 ? "--input-list=" + INPATH.toString() : dataDir.toString(), "--shard-url",
            cloudJettys.get(0).url, "--shard-url", cloudJettys.get(1).url, "--shard-url",
            cloudJettys.get(2).url, "--go-live-threads", Integer.toString(random().nextInt(15) + 1),
            "--verbose", "--go-live" };
    args = prependInitialArgs(args);

    if (true) {
        tool = new MapReduceIndexerTool();
        res = ToolRunner.run(jobConf, tool, args);
        assertEquals(0, res);
        assertTrue(tool.job.isComplete());
        assertTrue(tool.job.isSuccessful());
        results = server.query(new SolrQuery("*:*"));
        assertEquals(20, results.getResults().getNumFound());
    }

    fs.delete(inDir, true);
    fs.delete(outDir, true);
    fs.delete(dataDir, true);
    assertTrue(fs.mkdirs(inDir));
    INPATH = upAvroFile(fs, inDir, DATADIR, dataDir, inputAvroFile2);

    args = new String[] { "--solr-home-dir=" + MINIMR_CONF_DIR.getAbsolutePath(),
            "--output-dir=" + outDir.toString(), "--mappers=3", "--verbose", "--go-live",
            ++numRuns % 2 == 0 ? "--input-list=" + INPATH.toString() : dataDir.toString(), "--shard-url",
            cloudJettys.get(0).url, "--shard-url", cloudJettys.get(1).url, "--shard-url",
            cloudJettys.get(2).url, "--go-live-threads", Integer.toString(random().nextInt(15) + 1) };
    args = prependInitialArgs(args);

    if (true) {
        tool = new MapReduceIndexerTool();
        res = ToolRunner.run(jobConf, tool, args);
        assertEquals(0, res);
        assertTrue(tool.job.isComplete());
        assertTrue(tool.job.isSuccessful());
        results = server.query(new SolrQuery("*:*"));

        assertEquals(22, results.getResults().getNumFound());
    }

    // try using zookeeper
    String collection = "collection1";
    if (random().nextBoolean()) {
        // sometimes, use an alias
        createAlias("updatealias", "collection1");
        collection = "updatealias";
    }

    fs.delete(inDir, true);
    fs.delete(outDir, true);
    fs.delete(dataDir, true);
    INPATH = upAvroFile(fs, inDir, DATADIR, dataDir, inputAvroFile3);

    args = new String[] { "--output-dir=" + outDir.toString(), "--mappers=3", "--reducers=6", "--verbose",
            "--go-live", ++numRuns % 2 == 0 ? "--input-list=" + INPATH.toString() : dataDir.toString(),
            "--zk-host", zkServer.getZkAddress(), "--collection", collection };
    args = prependInitialArgs(args);

    if (true) {
        tool = new MapReduceIndexerTool();
        res = ToolRunner.run(jobConf, tool, args);
        assertEquals(0, res);
        assertTrue(tool.job.isComplete());
        assertTrue(tool.job.isSuccessful());

        results = server.query(new SolrQuery("*:*"));
        assertEquals(2126, results.getResults().getNumFound());
    }

    server.shutdown();

    // try using zookeeper with replication
    String replicatedCollection = "replicated_collection";
    createCollection(replicatedCollection, 2, 3, 2);
    waitForRecoveriesToFinish(false);
    cloudClient.setDefaultCollection(replicatedCollection);
    fs.delete(inDir, true);
    fs.delete(outDir, true);
    fs.delete(dataDir, true);
    assertTrue(fs.mkdirs(dataDir));
    INPATH = upAvroFile(fs, inDir, DATADIR, dataDir, inputAvroFile3);

    args = new String[] { "--solr-home-dir=" + MINIMR_CONF_DIR.getAbsolutePath(),
            "--output-dir=" + outDir.toString(), "--mappers=3", "--reducers=6", "--verbose", "--go-live",
            "--zk-host", zkServer.getZkAddress(), "--collection", replicatedCollection, dataDir.toString() };
    args = prependInitialArgs(args);

    if (true) {
        tool = new MapReduceIndexerTool();
        res = ToolRunner.run(jobConf, tool, args);
        assertEquals(0, res);
        assertTrue(tool.job.isComplete());
        assertTrue(tool.job.isSuccessful());

        results = cloudClient.query(new SolrQuery("*:*"));
        assertEquals(2104, results.getResults().getNumFound());

        checkConsistency(replicatedCollection);
    }

    // try using solr_url with replication
    cloudClient.deleteByQuery("*:*");
    cloudClient.commit();
    fs.delete(inDir, true);
    fs.delete(dataDir, true);
    assertTrue(fs.mkdirs(dataDir));
    INPATH = upAvroFile(fs, inDir, DATADIR, dataDir, inputAvroFile3);

    args = new String[] { "--solr-home-dir=" + MINIMR_CONF_DIR.getAbsolutePath(),
            "--output-dir=" + outDir.toString(), "--shards", "2", "--mappers=3", "--verbose", "--go-live",
            "--go-live-threads", Integer.toString(random().nextInt(15) + 1), dataDir.toString() };
    args = prependInitialArgs(args);

    List<String> argList = new ArrayList<String>();
    getShardUrlArgs(argList, replicatedCollection);
    args = concat(args, argList.toArray(new String[0]));

    if (true) {
        tool = new MapReduceIndexerTool();
        res = ToolRunner.run(jobConf, tool, args);
        assertEquals(0, res);
        assertTrue(tool.job.isComplete());
        assertTrue(tool.job.isSuccessful());

        checkConsistency(replicatedCollection);

        results = cloudClient.query(new SolrQuery("*:*"));
        assertEquals(2104, results.getResults().getNumFound());
    }

}

From source file:org.apache.sysml.runtime.matrix.CSVReblockMR.java

License:Apache License

public static AssignRowIDMRReturn runAssignRowIDMRJob(String[] inputs, InputInfo[] inputInfos, int[] brlens,
        int[] bclens, String reblockInstructions, int replication, String[] smallestFiles) throws Exception {
    AssignRowIDMRReturn ret = new AssignRowIDMRReturn();
    JobConf job;
    job = new JobConf(CSVReblockMR.class);
    job.setJobName("Assign-RowID-MR");

    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;//from  w ww.  j a v a 2s.c  o  m

    //set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false,
            ConvertTarget.CELL);

    job.setStrings(SMALLEST_FILE_NAME_PER_INPUT, smallestFiles);

    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setCSVReblockInstructions(job, reblockInstructions);

    //set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

    //set up custom map/reduce configurations 
    DMLConfig config = ConfigurationManager.getDMLConfig();
    MRJobConfiguration.setupCustomMRConfigurations(job, config);

    //set up the number of reducers
    job.setNumReduceTasks(1);

    // Print the complete instruction
    //if (LOG.isTraceEnabled())
    //inst.printCompelteMRJobInstruction();

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(CSVAssignRowIDMapper.class);
    job.setMapOutputKeyClass(ByteWritable.class);
    job.setMapOutputValueClass(OffsetCount.class);

    //configure reducer
    job.setReducerClass(CSVAssignRowIDReducer.class);

    //turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    //set up the output file
    ret.counterFile = new Path(MRJobConfiguration.constructTempOutputFilename());
    job.setOutputFormat(SequenceFileOutputFormat.class);
    FileOutputFormat.setOutputPath(job, ret.counterFile);
    job.setOutputKeyClass(ByteWritable.class);
    job.setOutputValueClass(OffsetCount.class);

    RunningJob runjob = JobClient.runJob(job);

    /* Process different counters */

    Group rgroup = runjob.getCounters().getGroup(NUM_ROWS_IN_MATRIX);
    Group cgroup = runjob.getCounters().getGroup(NUM_COLS_IN_MATRIX);
    ret.rlens = new long[inputs.length];
    ret.clens = new long[inputs.length];
    for (int i = 0; i < inputs.length; i++) {
        // number of non-zeros
        ret.rlens[i] = rgroup.getCounter(Integer.toString(i));
        ret.clens[i] = cgroup.getCounter(Integer.toString(i));
    }
    return ret;
}

From source file:org.apache.sysml.runtime.matrix.CSVReblockMR.java

License:Apache License

private static JobReturn runCSVReblockJob(MRJobInstruction inst, String[] inputs, InputInfo[] inputInfos,
        long[] rlens, long[] clens, int[] brlens, int[] bclens, String reblockInstructions,
        String otherInstructionsInReducer, int numReducers, int replication, byte[] resultIndexes,
        String[] outputs, OutputInfo[] outputInfos, Path counterFile, String[] smallestFiles) throws Exception {
    JobConf job;
    job = new JobConf(ReblockMR.class);
    job.setJobName("CSV-Reblock-MR");

    byte[] realIndexes = new byte[inputs.length];
    for (byte b = 0; b < realIndexes.length; b++)
        realIndexes[b] = b;// w  w  w.  j  a  v a 2  s . c o m

    //set up the input files and their format information
    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, inputs, inputInfos, brlens, bclens, false,
            ConvertTarget.CELL);

    job.setStrings(SMALLEST_FILE_NAME_PER_INPUT, smallestFiles);

    //set up the dimensions of input matrices
    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);

    //set up the block size
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

    //set up the aggregate instructions that will happen in the combiner and reducer
    MRJobConfiguration.setCSVReblockInstructions(job, reblockInstructions);

    //set up the instructions that will happen in the reducer, after the aggregation instrucions
    MRJobConfiguration.setInstructionsInReducer(job, otherInstructionsInReducer);

    //set up the replication factor for the results
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    //set up custom map/reduce configurations 
    DMLConfig config = ConfigurationManager.getDMLConfig();
    MRJobConfiguration.setupCustomMRConfigurations(job, config);

    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null,
            reblockInstructions, null, otherInstructionsInReducer, resultIndexes);

    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, null,
            reblockInstructions, null, null, otherInstructionsInReducer, resultIndexes, mapoutputIndexes,
            false);

    MatrixCharacteristics[] stats = ret.stats;

    //set up the number of reducers
    int numRed = WriteCSVMR.determineNumReducers(rlens, clens, config.getIntValue(DMLConfig.NUM_REDUCERS),
            ret.numReducerGroups);
    job.setNumReduceTasks(numRed);

    // Print the complete instruction
    //if (LOG.isTraceEnabled())
    //   inst.printCompelteMRJobInstruction(stats);

    // Update resultDimsUnknown based on computed "stats"
    byte[] resultDimsUnknown = new byte[resultIndexes.length];
    for (int i = 0; i < resultIndexes.length; i++) {
        if (stats[i].getRows() == -1 || stats[i].getCols() == -1) {
            resultDimsUnknown[i] = (byte) 1;
        } else {
            resultDimsUnknown[i] = (byte) 0;
        }
    }

    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, resultIndexes, resultDimsUnknown, outputs, outputInfos, true,
            true);

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(CSVReblockMapper.class);
    job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
    job.setMapOutputValueClass(BlockRow.class);

    //configure reducer
    job.setReducerClass(CSVReblockReducer.class);

    //turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);
    Path cachefile = new Path(counterFile, "part-00000");
    DistributedCache.addCacheFile(cachefile.toUri(), job);
    DistributedCache.createSymlink(job);
    job.set(ROWID_FILE_NAME, cachefile.toString());

    RunningJob runjob = JobClient.runJob(job);

    MapReduceTool.deleteFileIfExistOnHDFS(counterFile, job);

    /* Process different counters */

    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        // number of non-zeros
        stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
        //   System.out.println("result #"+resultIndexes[i]+" ===>\n"+stats[i]);
    }
    return new JobReturn(stats, outputInfos, runjob.isSuccessful());
}

From source file:org.apache.sysml.runtime.matrix.mapred.MRJobConfiguration.java

License:Apache License

public static void setPartitioningInfo(JobConf job, long rlen, long clen, int brlen, int bclen, InputInfo ii,
        OutputInfo oi, PDataPartitionFormat dpf, int n, String fnameNew, boolean keepIndexes)
        throws DMLRuntimeException {
    //set basic partitioning information
    setPartitioningInfo(job, rlen, clen, brlen, bclen, ii, oi, dpf, n, fnameNew);

    //set transpose sparse column vector
    job.setBoolean(PARTITIONING_OUTPUT_KEEP_INDEXES_CONFIG, keepIndexes);
}

From source file:org.apache.sysml.runtime.matrix.sort.PickFromCompactInputFormat.java

License:Apache License

public static Set<Integer> setPickRecordsInEachPartFile(JobConf job, MetaDataNumItemsByEachReducer metadata,
        double[] probs) {
    HashMap<Integer, ArrayList<Pair<Integer, Integer>>> posMap = new HashMap<>();
    getPointsInEachPartFile(metadata.getNumItemsArray(), probs, posMap);

    for (Entry<Integer, ArrayList<Pair<Integer, Integer>>> e : posMap.entrySet()) {
        job.set(SELECTED_POINTS_PREFIX + e.getKey(), getString(e.getValue()));
        //System.out.println(e.getKey()+": "+getString(e.getValue()));
    }//ww  w . j a  v  a 2  s.c  o  m
    job.setBoolean(INPUT_IS_VECTOR, true);
    return posMap.keySet();
}

From source file:org.apache.sysml.runtime.matrix.sort.PickFromCompactInputFormat.java

License:Apache License

public static void setRangePickPartFiles(JobConf job, MetaDataNumItemsByEachReducer metadata, double lbound,
        double ubound) {

    if (lbound < 0 || lbound > 1 || ubound < 0 || ubound > 1 || lbound >= ubound) {
        throw new RuntimeException("Invalid ranges for range pick: [" + lbound + "," + ubound + "]");
    }/*from  ww w . j  a  v  a 2s  .c  o  m*/

    long[] counts = metadata.getNumItemsArray();
    long[] ranges = new long[counts.length];
    ranges[0] = counts[0];
    for (int i = 1; i < counts.length; i++)
        ranges[i] = ranges[i - 1] + counts[i];
    long sumwt = ranges[ranges.length - 1];

    double qbegin = lbound * sumwt;
    double qend = ubound * sumwt;

    // Find part files that overlap with range [qbegin,qend]
    int partID = -1;
    long wt = 0;

    // scan until the part containing qbegin
    while (wt < qbegin) {
        partID++;
        wt += counts[partID];
    }

    StringBuilder sb = new StringBuilder();
    while (wt <= qend) {
        sb.append(partID + "," + (wt - counts[partID]) + ";"); // partID, weight until this part
        partID++;
        if (partID < counts.length)
            wt += counts[partID];
    }
    sb.append(partID + "," + (wt - counts[partID]) + ";");
    sb.append(sumwt + "," + lbound + "," + ubound);

    job.set(SELECTED_RANGES, sb.toString());
    job.setBoolean(INPUT_IS_VECTOR, false);
}

From source file:org.apache.sysml.runtime.matrix.SortMR.java

License:Apache License

@SuppressWarnings({ "unchecked", "rawtypes" })
public static JobReturn runJob(MRJobInstruction inst, String input, InputInfo inputInfo, long rlen, long clen,
        int brlen, int bclen, String combineInst, String sortInst, int numReducers, int replication,
        String output, OutputInfo outputInfo, boolean valueIsWeight) throws Exception {
    boolean sortIndexes = getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes;
    String tmpOutput = sortIndexes ? MRJobConfiguration.constructTempOutputFilename() : output;

    JobConf job = new JobConf(SortMR.class);
    job.setJobName("SortMR");

    //setup partition file
    String pfname = MRJobConfiguration.setUpSortPartitionFilename(job);
    Path partitionFile = new Path(pfname);
    URI partitionUri = new URI(partitionFile.toString());

    //setup input/output paths
    Path inputDir = new Path(input);
    inputDir = inputDir.makeQualified(inputDir.getFileSystem(job));
    FileInputFormat.setInputPaths(job, inputDir);
    Path outpath = new Path(tmpOutput);
    FileOutputFormat.setOutputPath(job, outpath);
    MapReduceTool.deleteFileIfExistOnHDFS(outpath, job);

    //set number of reducers (1 if local mode)
    if (!InfrastructureAnalyzer.isLocalMode(job)) {
        MRJobConfiguration.setNumReducers(job, numReducers, numReducers);
        //ensure partition size <= 10M records to avoid scalability bottlenecks
        //on cp-side qpick instructions for quantile/iqm/median (~128MB)
        if (!(getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes))
            job.setNumReduceTasks((int) Math.max(job.getNumReduceTasks(), rlen / 10000000));
    } else //in case of local mode
        job.setNumReduceTasks(1);/*from   w  w  w . ja  v  a 2s  .com*/

    //setup input/output format
    job.setInputFormat(SamplingSortMRInputFormat.class);
    SamplingSortMRInputFormat.setTargetKeyValueClasses(job,
            (Class<? extends WritableComparable>) outputInfo.outputKeyClass, outputInfo.outputValueClass);

    //setup instructions and meta information
    if (combineInst != null && !combineInst.trim().isEmpty())
        job.set(COMBINE_INSTRUCTION, combineInst);
    job.set(SORT_INSTRUCTION, sortInst);
    job.setBoolean(VALUE_IS_WEIGHT, valueIsWeight);
    boolean desc = getSortInstructionDescending(sortInst);
    job.setBoolean(SORT_DECREASING, desc);
    MRJobConfiguration.setBlockSize(job, (byte) 0, brlen, bclen);
    MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL);
    int partitionWith0 = SamplingSortMRInputFormat.writePartitionFile(job, partitionFile);

    //setup mapper/reducer/partitioner/output classes
    if (getSortInstructionType(sortInst) == SortKeys.OperationTypes.Indexes) {
        MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL);
        job.setOutputFormat(OutputInfo.BinaryBlockOutputInfo.outputFormatClass);
        job.setMapperClass(IndexSortMapper.class);
        job.setReducerClass(IndexSortReducer.class);
        job.setMapOutputKeyClass(!desc ? IndexSortComparable.class : IndexSortComparableDesc.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(MatrixIndexes.class);
        job.setOutputValueClass(MatrixBlock.class);
    } else { //default case: SORT w/wo weights
        MRJobConfiguration.setInputInfo(job, (byte) 0, inputInfo, brlen, bclen, ConvertTarget.CELL);
        job.setOutputFormat(CompactOutputFormat.class);
        job.setMapperClass(ValueSortMapper.class);
        job.setReducerClass(ValueSortReducer.class);
        job.setOutputKeyClass(outputInfo.outputKeyClass); //double
        job.setOutputValueClass(outputInfo.outputValueClass); //int
    }
    job.setPartitionerClass(TotalOrderPartitioner.class);

    //setup distributed cache
    DistributedCache.addCacheFile(partitionUri, job);
    DistributedCache.createSymlink(job);

    //setup replication factor
    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

    //set up custom map/reduce configurations 
    DMLConfig config = ConfigurationManager.getDMLConfig();
    MRJobConfiguration.setupCustomMRConfigurations(job, config);

    MatrixCharacteristics[] s = new MatrixCharacteristics[1];
    s[0] = new MatrixCharacteristics(rlen, clen, brlen, bclen);

    // Print the complete instruction
    if (LOG.isTraceEnabled())
        inst.printCompleteMRJobInstruction(s);

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    //run mr job
    RunningJob runjob = JobClient.runJob(job);
    Group group = runjob.getCounters().getGroup(NUM_VALUES_PREFIX);
    numReducers = job.getNumReduceTasks();

    //process final meta data
    long[] counts = new long[numReducers];
    long total = 0;
    for (int i = 0; i < numReducers; i++) {
        counts[i] = group.getCounter(Integer.toString(i));
        total += counts[i];
    }

    //add missing 0s back to the results
    long missing0s = 0;
    if (total < rlen * clen) {
        if (partitionWith0 < 0)
            throw new RuntimeException("no partition contains 0, which is wrong!");
        missing0s = rlen * clen - total;
        counts[partitionWith0] += missing0s;
    } else
        partitionWith0 = -1;

    if (sortIndexes) {
        //run builtin job for shifting partially sorted blocks according to global offsets
        //we do this in this custom form since it would not fit into the current structure
        //of systemml to output two intermediates (partially sorted data, offsets) out of a 
        //single SortKeys lop
        boolean success = runjob.isSuccessful();
        if (success) {
            success = runStitchupJob(tmpOutput, rlen, clen, brlen, bclen, counts, numReducers, replication,
                    output);
        }
        MapReduceTool.deleteFileIfExistOnHDFS(tmpOutput);
        MapReduceTool.deleteFileIfExistOnHDFS(pfname);
        return new JobReturn(s[0], OutputInfo.BinaryBlockOutputInfo, success);
    } else {
        MapReduceTool.deleteFileIfExistOnHDFS(pfname);
        return new JobReturn(s[0], counts, partitionWith0, missing0s, runjob.isSuccessful());
    }
}

From source file:org.apache.sysml.runtime.transform.ApplyTfBBMR.java

License:Apache License

public static JobReturn runJob(String inputPath, String rblkInst, String otherInst, String spec,
        String mapsPath, String tmpPath, String outputPath, String partOffsetsFile,
        CSVFileFormatProperties inputDataProperties, long numRows, long numColsBefore, long numColsAfter,
        int replication, String headerLine) throws Exception {

    CSVReblockInstruction rblk = (CSVReblockInstruction) InstructionParser.parseSingleInstruction(rblkInst);

    long[] rlens = new long[] { numRows };
    long[] clens = new long[] { numColsAfter };
    int[] brlens = new int[] { rblk.brlen };
    int[] bclens = new int[] { rblk.bclen };
    byte[] realIndexes = new byte[] { rblk.input };
    byte[] resultIndexes = new byte[] { rblk.output };

    JobConf job = new JobConf(ApplyTfBBMR.class);
    job.setJobName("ApplyTfBB");

    /* Setup MapReduce Job */
    job.setJarByClass(ApplyTfBBMR.class);

    // set relevant classes
    job.setMapperClass(ApplyTfBBMapper.class);

    MRJobConfiguration.setUpMultipleInputs(job, realIndexes, new String[] { inputPath },
            new InputInfo[] { InputInfo.CSVInputInfo }, brlens, bclens, false, ConvertTarget.CELL);

    MRJobConfiguration.setMatricesDimensions(job, realIndexes, rlens, clens);
    MRJobConfiguration.setBlocksSizes(job, realIndexes, brlens, bclens);

    MRJobConfiguration.setCSVReblockInstructions(job, rblkInst);

    //set up the instructions that will happen in the reducer, after the aggregation instrucions
    MRJobConfiguration.setInstructionsInReducer(job, otherInst);

    job.setInt(MRConfigurationNames.DFS_REPLICATION, replication);

    //set up preferred custom serialization framework for binary block format
    if (MRJobConfiguration.USE_BINARYBLOCK_SERIALIZATION)
        MRJobConfiguration.addBinaryBlockSerializationFramework(job);

    //set up what matrices are needed to pass from the mapper to reducer
    HashSet<Byte> mapoutputIndexes = MRJobConfiguration.setUpOutputIndexesForMapper(job, realIndexes, null,
            rblkInst, null, otherInst, resultIndexes);

    MatrixChar_N_ReducerGroups ret = MRJobConfiguration.computeMatrixCharacteristics(job, realIndexes, null,
            rblkInst, null, null, null, resultIndexes, mapoutputIndexes, false);

    //set up the number of reducers
    int numRed = WriteCSVMR.determineNumReducers(rlens, clens, ConfigurationManager.getNumReducers(),
            ret.numReducerGroups);/*ww  w.j  av  a2 s  . c om*/
    job.setNumReduceTasks(numRed);

    //set up the multiple output files, and their format information
    MRJobConfiguration.setUpMultipleOutputs(job, new byte[] { rblk.output }, new byte[] { 0 },
            new String[] { outputPath }, new OutputInfo[] { OutputInfo.BinaryBlockOutputInfo }, true, false);

    // configure mapper and the mapper output key value pairs
    job.setMapperClass(ApplyTfBBMapper.class);
    job.setMapOutputKeyClass(TaggedFirstSecondIndexes.class);
    job.setMapOutputValueClass(BlockRow.class);

    //configure reducer
    job.setReducerClass(CSVReblockReducer.class);

    //turn off adaptivemr
    job.setBoolean("adaptivemr.map.enable", false);

    //set unique working dir
    MRJobConfiguration.setUniqueWorkingDir(job);

    // Add transformation metadata file as well as partOffsetsFile to Distributed cache
    DistributedCache.addCacheFile((new Path(mapsPath)).toUri(), job);
    DistributedCache.createSymlink(job);

    Path cachefile = new Path(new Path(partOffsetsFile), "part-00000");
    DistributedCache.addCacheFile(cachefile.toUri(), job);
    DistributedCache.createSymlink(job);

    job.set(MRJobConfiguration.TF_HAS_HEADER, Boolean.toString(inputDataProperties.hasHeader()));
    job.set(MRJobConfiguration.TF_DELIM, inputDataProperties.getDelim());
    // Adding "dummy" string to handle the case of na_strings = ""
    if (inputDataProperties.getNAStrings() != null)
        job.set(MRJobConfiguration.TF_NA_STRINGS, TfUtils.prepNAStrings(inputDataProperties.getNAStrings()));
    job.set(MRJobConfiguration.TF_SPEC, spec);
    job.set(MRJobConfiguration.TF_SMALLEST_FILE, CSVReblockMR.findSmallestFile(job, inputPath));
    job.set(MRJobConfiguration.OUTPUT_MATRICES_DIRS_CONFIG, outputPath);
    job.setLong(MRJobConfiguration.TF_NUM_COLS, numColsBefore);
    job.set(MRJobConfiguration.TF_TXMTD_PATH, mapsPath);
    job.set(MRJobConfiguration.TF_HEADER, headerLine);
    job.set(CSVReblockMR.ROWID_FILE_NAME, cachefile.toString());
    job.set(MRJobConfiguration.TF_TMP_LOC, tmpPath);

    RunningJob runjob = JobClient.runJob(job);

    MapReduceTool.deleteFileIfExistOnHDFS(cachefile, job);

    Group group = runjob.getCounters().getGroup(MRJobConfiguration.NUM_NONZERO_CELLS);
    for (int i = 0; i < resultIndexes.length; i++) {
        ret.stats[i].setNonZeros(group.getCounter(Integer.toString(i)));
    }
    return new JobReturn(ret.stats, runjob.isSuccessful());
}