List of usage examples for org.apache.hadoop.mapred.JobConf#get

public String get(String name)

Gets the value of the name property, or null if no such property exists.

Parameter: name - the property name.
Returns: the property value, or null if no such property exists.
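Before the collected examples, here is a minimal, self-contained sketch of the call itself. The property names my.example.key and no.such.key and the class JobConfGetExample are made up for illustration; the two-argument overload shown last is inherited from Configuration and returns the supplied default instead of null.

import org.apache.hadoop.mapred.JobConf;

public class JobConfGetExample {
    public static void main(String[] args) {
        JobConf conf = new JobConf();
        conf.set("my.example.key", "my-value"); // hypothetical property, set only for this demo

        String present = conf.get("my.example.key");          // "my-value"
        String missing = conf.get("no.such.key");              // null - property was never set
        String fallback = conf.get("no.such.key", "default");  // "default" via the two-arg overload

        System.out.println(present + " / " + missing + " / " + fallback);
    }
}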
From source file: IndexService.IndexMergeIFormatWriter.java
License: Open Source License

public IndexMergeIFormatWriter(String fileName, JobConf job) throws IOException {
    this.conf = job;
    ifdf = new IFormatDataFile(job);
    ihead = new IHead();
    String[] fieldStrings = job.getStrings(ConstVar.HD_fieldMap);
    IFieldMap fieldMap = new IFieldMap();
    for (int i = 0; i < fieldStrings.length; i++) {
        String[] def = fieldStrings[i].split(ConstVar.RecordSplit);
        byte type = Byte.valueOf(def[0]);
        int index = Short.valueOf(def[1]);
        fieldMap.addFieldType(new IRecord.IFType(type, index));
    }
    ihead.setFieldMap(fieldMap);

    String[] files = job.getStrings(ConstVar.HD_index_filemap);
    IUserDefinedHeadInfo iudhi = new IUserDefinedHeadInfo();
    iudhi.addInfo(123456, job.get("datafiletype"));
    for (int i = 0; i < files.length; i++) {
        iudhi.addInfo(i, files[i]);
    }
    ihead.setUdi(iudhi);
    ihead.setPrimaryIndex(0);
    ifdf.create(fileName, ihead);
    record = ifdf.getIRecordObj();
}
From source file: infinidb.hadoop.db.IDBFileInputFormat.java
License: Apache License

@Override
public RecordReader<NullWritable, NullWritable> getRecordReader(InputSplit arg0, JobConf arg1, Reporter arg2)
        throws IOException {
    final String filename = ((FileSplit) arg0).getPath().toString();
    final JobConf job = arg1;

    return new RecordReader<NullWritable, NullWritable>() {
        private boolean unread = true;

        @Override
        public void close() throws IOException {
        }

        @Override
        public NullWritable createKey() {
            return NullWritable.get();
        }

        @Override
        public NullWritable createValue() {
            return NullWritable.get();
        }

        @Override
        public long getPos() throws IOException {
            return 0;
        }

        @Override
        public float getProgress() throws IOException {
            return unread ? 0 : 1;
        }

        /* spawn a cpimport process for each input file */
        @Override
        public boolean next(NullWritable arg0, NullWritable arg1) throws IOException {
            InfiniDBConfiguration dbConf = new InfiniDBConfiguration(job);
            String schemaName = dbConf.getOutputSchemaName();
            String tableName = (filename.substring(filename.lastIndexOf('/') + 1, filename.length()));
            tableName = tableName.substring(0, tableName.lastIndexOf('.'));
            String output = job.get("mapred.output.dir");

            if (unread) {
                try {
                    StringBuilder loadCmdStr = new StringBuilder();
                    loadCmdStr.append(dbConf.getInfiniDBHome());
                    loadCmdStr.append("/bin/");
                    loadCmdStr.append("infinidoop_load.sh ");
                    loadCmdStr.append(filename);
                    loadCmdStr.append(" ");
                    loadCmdStr.append(schemaName);
                    loadCmdStr.append(" ");
                    loadCmdStr.append(tableName);

                    Process lChldProc = Runtime.getRuntime().exec(loadCmdStr.toString());

                    // Wait for the child to exit
                    lChldProc.waitFor();
                    BufferedReader lChldProcOutStream = new BufferedReader(
                            new InputStreamReader(lChldProc.getInputStream()));
                    BufferedReader stdError = new BufferedReader(
                            new InputStreamReader(lChldProc.getErrorStream()));

                    String lChldProcOutPutStr = null;
                    StringBuffer outpath = new StringBuffer();
                    outpath.append(job.getWorkingDirectory());
                    outpath.append("/");
                    outpath.append(output);
                    outpath.append("/");
                    outpath.append(tableName);
                    outpath.append(".log");

                    Path pt = new Path(outpath.toString());
                    FileSystem fs = FileSystem.get(new Configuration());
                    BufferedWriter br = new BufferedWriter(new OutputStreamWriter(fs.create(pt, false)));

                    // catch output
                    while ((lChldProcOutPutStr = lChldProcOutStream.readLine()) != null) {
                        br.write(lChldProcOutPutStr);
                        br.newLine();
                    }

                    // catch error
                    while ((lChldProcOutPutStr = stdError.readLine()) != null) {
                        br.write(lChldProcOutPutStr);
                        br.newLine();
                    }
                    //br.write(outpath.toString());
                    //br.newLine();
                    //br.write(loadCmdStr.toString());
                    //br.newLine();
                    //br.write(filename);
                    br.close();
                    lChldProcOutStream.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
                unread = false;
                return true;
            } else {
                return false;
            }
        }
    };
}
From source file: io.apigee.trireme.samples.hadoop.HadoopBase.java
License: Open Source License

@Override
public void configure(JobConf conf) {
    super.configure(conf);
    // Noderunner works much better and more node-like if the script is a pointer on the local
    // filesystem. This means we probably have to do something to make this work
    // in "real" Hadoop which is distributed.
    scriptFileName = conf.get(SCRIPT_FILE_KEY);
}
From source file: it.crs4.pydoop.pipes.Application.java
License: Apache License

/**
 * Start the child process to handle the task for us.
 * @param conf the task's configuration
 * @param recordReader the fake record reader to update progress with
 * @param output the collector to send output to
 * @param reporter the reporter for the task
 * @param outputKeyClass the class of the output keys
 * @param outputValueClass the class of the output values
 * @throws IOException
 * @throws InterruptedException
 */
Application(JobConf conf, RecordReader<FloatWritable, NullWritable> recordReader,
        OutputCollector<K2, V2> output, Reporter reporter, Class<? extends K2> outputKeyClass,
        Class<? extends V2> outputValueClass) throws IOException, InterruptedException {
    serverSocket = new ServerSocket(0);
    Map<String, String> env = new HashMap<String, String>();
    // add TMPDIR environment variable with the value of java.io.tmpdir
    env.put("TMPDIR", System.getProperty("java.io.tmpdir"));
    env.put(Submitter.PORT, Integer.toString(serverSocket.getLocalPort()));

    TaskAttemptID taskid = TaskAttemptID.forName(conf.get(MRJobConfig.TASK_ATTEMPT_ID));

    // get the task's working directory
    String workDir = LocalJobRunner.getLocalTaskDir(conf.getUser(), taskid.getJobID().toString(),
            taskid.getTaskID().toString(), false);

    // Add token to the environment if security is enabled
    Token<JobTokenIdentifier> jobToken = TokenCache.getJobToken(conf.getCredentials());
    // This password is used as shared secret key between this application and
    // child pipes process
    byte[] password = jobToken.getPassword();
    String localPasswordFile = new File(workDir, "jobTokenPassword").getAbsolutePath();
    writePasswordToLocalFile(localPasswordFile, password, conf);
    env.put("hadoop.pipes.shared.secret.location", localPasswordFile);

    List<String> cmd = new ArrayList<String>();
    String interpretor = conf.get(Submitter.INTERPRETOR);
    if (interpretor != null) {
        cmd.add(interpretor);
    }
    String executable = DistributedCache.getLocalCacheFiles(conf)[0].toString();
    if (!(new File(executable).canExecute())) {
        // LinuxTaskController sets +x permissions on all distcache files already.
        // In case of DefaultTaskController, set permissions here.
        FileUtil.chmod(executable, "u+x");
    }
    cmd.add(executable);

    // wrap the command in a stdout/stderr capture
    // we are starting map/reduce task of the pipes job. this is not a cleanup
    // attempt.
    File stdout = TaskLog.getTaskLogFile(taskid, false, TaskLog.LogName.STDOUT);
    File stderr = TaskLog.getTaskLogFile(taskid, false, TaskLog.LogName.STDERR);
    long logLength = TaskLog.getTaskLogLength(conf);
    cmd = TaskLog.captureOutAndError(null, cmd, stdout, stderr, logLength, false);

    process = runClient(cmd, env);
    clientSocket = serverSocket.accept();

    String challenge = getSecurityChallenge();
    String digestToSend = createDigest(password, challenge);
    String digestExpected = createDigest(password, digestToSend);
    handler = new OutputHandler<K2, V2>(output, reporter, recordReader, digestExpected);
    K2 outputKey = (K2) ReflectionUtils.newInstance(outputKeyClass, conf);
    V2 outputValue = (V2) ReflectionUtils.newInstance(outputValueClass, conf);
    downlink = new BinaryProtocol<K1, V1, K2, V2>(clientSocket, handler, outputKey, outputValue, conf);

    downlink.authenticate(digestToSend, challenge);
    waitForAuthentication();
    LOG.debug("Authentication succeeded");
    downlink.start();
    downlink.setJobConf(conf);
}
From source file: it.crs4.pydoop.pipes.Submitter.java
License: Apache License

/**
 * Get the URI of the application's executable.
 * @param conf
 * @return the URI where the application's executable is located
 */
public static String getExecutable(JobConf conf) {
    return conf.get(Submitter.EXECUTABLE);
}
From source file: it.crs4.pydoop.pipes.Submitter.java
License: Apache License

/**
 * Set the configuration, if it doesn't already have a value for the given
 * key.
 * @param conf the configuration to modify
 * @param key the key to set
 * @param value the new "default" value to set
 */
private static void setIfUnset(JobConf conf, String key, String value) {
    if (conf.get(key) == null) {
        conf.set(key, value);
    }
}
From source file: ivory.core.index.MergeGlobalStatsAcrossIndexSegments.java
License: Apache License

public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), MergeGlobalStatsAcrossIndexSegments.class);
    FileSystem fs = FileSystem.get(conf);

    String collectionName = conf.get("Ivory.CollectionName");
    String indexPaths = conf.get("Ivory.IndexPaths");
    String dataOutputPath = conf.get("Ivory.DataOutputPath");
    int dfThreshold = conf.getInt("Ivory.DfThreshold", 0);

    // first, compute size of global term space
    Path tmpPaths = new Path("/tmp/index-paths.txt");
    FSDataOutputStream out = fs.create(tmpPaths, true);
    for (String s : indexPaths.split(",")) {
        out.write(new String(s + "\n").getBytes());
    }
    out.close();

    LOG.info("Job: ComputeNumberOfTermsAcrossIndexSegments");
    conf.setJobName("ComputeNumberOfTermsAcrossIndexSegments:" + collectionName);

    FileInputFormat.addInputPath(conf, tmpPaths);

    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(1);
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInputFormat(NLineInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(PairOfIntLong.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(IdentityReducer.class);

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    Counters counters = job.getCounters();
    long totalNumTerms = counters
            .findCounter("org.apache.hadoop.mapred.Task$Counter", 6, "REDUCE_INPUT_GROUPS").getCounter();

    LOG.info("total number of terms in global dictionary = " + totalNumTerms);

    // now build the dictionary
    fs.delete(new Path(dataOutputPath), true);

    conf = new JobConf(getConf(), MergeGlobalStatsAcrossIndexSegments.class);

    LOG.info("Job: MergeGlobalStatsAcrossIndexSegments");
    conf.setJobName("MergeGlobalStatsAcrossIndexSegments:" + collectionName);

    FileInputFormat.addInputPath(conf, tmpPaths);

    conf.setNumMapTasks(1);
    conf.setNumReduceTasks(1);
    conf.set("mapred.child.java.opts", "-Xmx2048m");
    conf.setInputFormat(NLineInputFormat.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(PairOfIntLong.class);
    conf.setOutputFormat(NullOutputFormat.class);
    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);

    conf.setLong("Ivory.IndexNumberOfTerms", (int) totalNumTerms);

    startTime = System.currentTimeMillis();
    job = JobClient.runJob(conf);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    // compute some # docs, collection length, avg doc length
    long collectionLength = 0;
    int docCount = 0;
    for (String index : indexPaths.split(",")) {
        LOG.info("reading stats for " + index);

        RetrievalEnvironment env = new RetrievalEnvironment(index, fs);

        long l = env.readCollectionLength();
        int n = env.readCollectionDocumentCount();

        LOG.info(" - CollectionLength: " + l);
        LOG.info(" - CollectionDocumentCount: " + n);

        collectionLength += l;
        docCount += n;
    }

    float avgdl = (float) collectionLength / docCount;

    LOG.info("all index segments: ");
    LOG.info(" - CollectionLength: " + collectionLength);
    LOG.info(" - CollectionDocumentCount: " + docCount);
    LOG.info(" - AverageDocumentLenght: " + avgdl);

    RetrievalEnvironment env = new RetrievalEnvironment(dataOutputPath, fs);
    env.writeCollectionAverageDocumentLength(avgdl);
    env.writeCollectionLength(collectionLength);
    env.writeCollectionDocumentCount(docCount);

    return 0;
}
From source file: ivory.core.preprocess.BuildWeightedIntDocVectors.java
License: Apache License

@SuppressWarnings("deprecation")
public int runTool() throws Exception {
    sLogger.setLevel(Level.WARN);

    sLogger.info("PowerTool: GetWeightedIntDocVectors");

    // create a new JobConf, inheriting from the configuration of this
    // PowerTool
    JobConf conf = new JobConf(getConf(), BuildWeightedIntDocVectors.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    String outputPath = env.getWeightedIntDocVectorsDirectory();
    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0);
    String collectionName = conf.get("Ivory.CollectionName");

    sLogger.info("Characteristics of the collection:");
    sLogger.info(" - CollectionName: " + collectionName);
    sLogger.info("Characteristics of the job:");
    sLogger.info(" - NumMapTasks: " + mapTasks);
    sLogger.info(" - MinSplitSize: " + minSplitSize);

    String dfByIntFilePath = env.getDfByIntData();
    String cfByIntFilePath = env.getCfByIntData();

    /* add df table to cache */
    if (!fs.exists(new Path(dfByIntFilePath))) {
        throw new RuntimeException("Error, df data file " + dfByIntFilePath + "doesn't exist!");
    }
    DistributedCache.addCacheFile(new URI(dfByIntFilePath), conf);

    /* add cf table to cache */
    if (!fs.exists(new Path(cfByIntFilePath))) {
        throw new RuntimeException("Error, cf data file " + cfByIntFilePath + "doesn't exist!");
    }
    DistributedCache.addCacheFile(new URI(cfByIntFilePath), conf);

    /* add dl table to cache */
    Path docLengthFile = env.getDoclengthsData();
    if (!fs.exists(docLengthFile)) {
        throw new RuntimeException("Error, doc-length data file " + docLengthFile + "doesn't exist!");
    }
    DistributedCache.addCacheFile(docLengthFile.toUri(), conf);

    Path inputPath = new Path(env.getIntDocVectorsDirectory());
    Path weightedVectorsPath = new Path(outputPath);

    if (fs.exists(weightedVectorsPath)) {
        sLogger.info("Output path already exists!");
        return 0;
    }
    //fs.delete(weightedVectirsPath, true);

    conf.setJobName("GetWeightedIntDocVectors:" + collectionName);
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(0);
    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, weightedVectorsPath);
    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(WeightedIntDocVector.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(WeightedIntDocVector.class);

    conf.setMapperClass(MyMapper.class);
    //conf.setInt("mapred.task.timeout",3600000);

    long startTime = System.currentTimeMillis();

    RunningJob job = JobClient.runJob(conf);

    sLogger.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");
    return 0;
}
From source file: ivory.index.BuildIntPostingsForwardIndex.java
License: Apache License

public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildIntPostingsForwardIndex.class);
    FileSystem fs = FileSystem.get(conf);

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0);
    String indexPath = conf.get("Ivory.IndexPath");

    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);
    String collectionName = env.readCollectionName();

    sLogger.info("Tool: BuildIntPostingsForwardIndex");
    sLogger.info(" - IndexPath: " + indexPath);
    sLogger.info(" - CollectionName: " + collectionName);

    conf.setJobName("BuildIntPostingsForwardIndex:" + collectionName);

    Path inputPath = new Path(env.getPostingsDirectory());
    FileInputFormat.setInputPaths(conf, inputPath);

    Path postingsIndexPath = new Path(env.getPostingsIndexData());

    if (fs.exists(postingsIndexPath)) {
        sLogger.info("Postings forward index path already exists!");
        return 0;
    }
    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(1);

    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setMapOutputKeyClass(IntWritable.class);
    conf.setMapOutputValueClass(Text.class);
    conf.setOutputKeyClass(Text.class);
    conf.setOutputValueClass(Text.class);
    conf.setOutputFormat(NullOutputFormat.class);

    conf.setMapRunnerClass(MyMapRunner.class);
    conf.setReducerClass(MyReducer.class);

    JobClient.runJob(conf);

    return 0;
}
From source file: ivory.index.BuildIPInvertedIndexDocSorted.java
License: Apache License

@SuppressWarnings("unused")
public int runTool() throws Exception {
    JobConf conf = new JobConf(getConf(), BuildIPInvertedIndexDocSorted.class);
    FileSystem fs = FileSystem.get(conf);

    String indexPath = conf.get("Ivory.IndexPath");
    RetrievalEnvironment env = new RetrievalEnvironment(indexPath, fs);

    String collectionName = env.readCollectionName();

    int mapTasks = conf.getInt("Ivory.NumMapTasks", 0);
    int reduceTasks = conf.getInt("Ivory.NumReduceTasks", 0);
    int minSplitSize = conf.getInt("Ivory.MinSplitSize", 0);
    int collectionDocCnt = env.readCollectionDocumentCount();

    LOG.info("PowerTool: BuildIPInvertedIndexDocSorted");
    LOG.info(" - IndexPath: " + indexPath);
    LOG.info(" - CollectionName: " + collectionName);
    LOG.info(" - CollectionDocumentCount: " + collectionDocCnt);
    LOG.info(" - NumMapTasks: " + mapTasks);
    LOG.info(" - NumReduceTasks: " + reduceTasks);
    LOG.info(" - MinSplitSize: " + minSplitSize);

    if (!fs.exists(new Path(indexPath))) {
        fs.mkdirs(new Path(indexPath));
    }

    Path inputPath = new Path(env.getIntDocVectorsDirectory());
    Path postingsPath = new Path(env.getPostingsDirectory());

    if (fs.exists(postingsPath)) {
        LOG.info("Postings already exist: no indexing will be performed.");
        return 0;
    }

    conf.setJobName("BuildIPInvertedIndex:" + collectionName);

    conf.setNumMapTasks(mapTasks);
    conf.setNumReduceTasks(reduceTasks);

    conf.setInt("Ivory.CollectionDocumentCount", collectionDocCnt);

    conf.setInt("mapred.min.split.size", minSplitSize);
    conf.set("mapred.child.java.opts", "-Xmx2048m");

    FileInputFormat.setInputPaths(conf, inputPath);
    FileOutputFormat.setOutputPath(conf, postingsPath);

    conf.setInputFormat(SequenceFileInputFormat.class);
    conf.setOutputFormat(SequenceFileOutputFormat.class);
    conf.setMapOutputKeyClass(PairOfInts.class);
    conf.setMapOutputValueClass(TermPositions.class);
    conf.setOutputKeyClass(IntWritable.class);
    conf.setOutputValueClass(PostingsListDocSortedPositional.class);

    conf.setMapperClass(MyMapper.class);
    conf.setReducerClass(MyReducer.class);
    conf.setPartitionerClass(MyPartitioner.class);

    long startTime = System.currentTimeMillis();
    RunningJob job = JobClient.runJob(conf);
    LOG.info("Job Finished in " + (System.currentTimeMillis() - startTime) / 1000.0 + " seconds");

    env.writePostingsType("ivory.data.PostingsListDocSortedPositional");

    return 0;
}