Usage examples for org.apache.hadoop.mapreduce.Job.getConfiguration()
public Configuration getConfiguration()
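In all of the examples below, getConfiguration() returns the live Configuration backing the job, so properties set on it before submission are serialized with the job and become visible to its tasks. A minimal sketch of that pattern (the property keys here are hypothetical placeholders, not taken from the examples below):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class GetConfigurationSketch {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        // getConfiguration() exposes the job's underlying Configuration.
        Configuration conf = job.getConfiguration();
        conf.set("my.custom.property", "some-value"); // hypothetical key
        conf.setInt("my.custom.retries", 3);          // typed setters also exist
        System.out.println(conf.get("my.custom.property"));
    }
}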
From source file: com.linkedin.cubert.pig.piggybank.storage.avro.AvroStorage.java
License: Apache License
/**
 * Get the avro schema from "location" and return the converted
 * PigSchema.
 */
@Override
public ResourceSchema getSchema(String location, Job job) throws IOException {
    /* get avro schema */
    AvroStorageLog.funcCall("getSchema");
    if (inputAvroSchema == null) {
        Configuration conf = job.getConfiguration();
        // If, within a script, you store to one location and then read from that
        // same location using AvroStorage, getPaths will be empty. Since
        // getSchema is called during script parsing, we don't want to fail
        // here if the path is not found.
        Set<Path> paths = AvroStorageUtils.getPaths(location, conf, false);
        if (!paths.isEmpty()) {
            setInputAvroSchema(paths, conf);
        }
    }
    if (inputAvroSchema != null) {
        AvroStorageLog.details("avro input schema:" + inputAvroSchema);
        /* convert to pig schema */
        ResourceSchema pigSchema = AvroSchema2Pig.convert(inputAvroSchema);
        AvroStorageLog.details("pig input schema:" + pigSchema);
        if (pigSchema.getFields().length == 1) {
            pigSchema = pigSchema.getFields()[0].getSchema();
        }
        Properties udfProps = getUDFProperties();
        udfProps.put(AVRO_INPUT_SCHEMA_PROPERTY, inputAvroSchema.toString());
        udfProps.put(AVRO_INPUT_PIG_SCHEMA_PROPERTY, pigSchema);
        if (schemaToMergedSchemaMap != null) {
            HashMap<URI, Map<Integer, Integer>> mergedSchemaMap = new HashMap<URI, Map<Integer, Integer>>();
            for (Entry<Path, Map<Integer, Integer>> entry : schemaToMergedSchemaMap.entrySet()) {
                // Path is not serializable
                mergedSchemaMap.put(entry.getKey().toUri(), entry.getValue());
            }
            udfProps.put(AVRO_MERGED_SCHEMA_PROPERTY, ObjectSerializer.serialize(mergedSchemaMap));
        }
        return pigSchema;
    } else {
        return null;
    }
}
From source file: com.linkedin.cubert.pig.piggybank.storage.avro.PigAvroOutputFormat.java
License: Apache License
/**
 * Enable output compression using the deflate codec and
 * specify its level.
 */
public static void setDeflateLevel(Job job, int level) {
    FileOutputFormat.setCompressOutput(job, true);
    job.getConfiguration().setInt(DEFLATE_LEVEL_KEY, level);
}
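A hedged usage sketch for the helper above, assuming deflate's usual 1-9 level range (DEFLATE_LEVEL_KEY is a constant defined elsewhere in PigAvroOutputFormat; the level chosen here is arbitrary):

Job job = Job.getInstance(new Configuration());
job.setOutputFormatClass(PigAvroOutputFormat.class);
// Level 6 trades compression ratio against CPU time; any value 1-9 is valid.
PigAvroOutputFormat.setDeflateLevel(job, 6);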
From source file: com.linkedin.cubert.utils.CubertMD.java
License: Open Source License
private static void writeMetaFile(String metaFilePath, HashMap<String, String> metaFileKeyValues)
        throws IOException {
    Job tempjob = new Job();
    Configuration tempconf = tempjob.getConfiguration();
    FileSystem fs = FileSystem.get(tempconf);
    FSDataOutputStream outStream = fs.create(new Path(metaFilePath + "/.meta"));
    for (String key : metaFileKeyValues.keySet())
        outStream.write((key + " " + metaFileKeyValues.get(key) + "\n").getBytes());
    outStream.flush();
    outStream.close();
}
From source file: com.linkedin.cubert.utils.CubertMD.java
License: Open Source License
public static HashMap<String, String> readMetafile(String metaFilePath) throws IOException {
    Job tempjob = new Job();
    Configuration tempconf = tempjob.getConfiguration();
    FileSystem fs = FileSystem.get(tempconf);
    HashMap<String, String> result = new HashMap<String, String>();
    FSDataInputStream inStream;
    try {
        inStream = fs.open(new Path(metaFilePath + "/.meta"));
        BufferedReader breader = new BufferedReader(new InputStreamReader(inStream));
        String line;
        while ((line = breader.readLine()) != null) {
            String[] splits = line.split("\\s+");
            result.put(splits[0], splits[1]);
        }
    } catch (IOException e) {
        return result;
    }
    return result;
}
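readMetafile (public, unlike the private writeMetaFile above) parses "<path>/.meta" as whitespace-separated key/value lines, one pair per line, and returns whatever it managed to read if the file is missing. A small sketch of calling it; the path is a hypothetical placeholder:

// "/data/myrelation" is hypothetical; readMetafile opens "/data/myrelation/.meta".
HashMap<String, String> meta = CubertMD.readMetafile("/data/myrelation");
for (Map.Entry<String, String> e : meta.entrySet()) {
    System.out.println(e.getKey() + " = " + e.getValue());
}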
From source file: com.linkedin.hadoop.example.WordCountCounters.java
License: Apache License
/**
 * Azkaban will look for a method named `run` to start your job. Use this method to set up all the
 * Hadoop-related configuration for your job and submit it.
 *
 * @throws Exception If there is an exception during the configuration or submission of your job
 */
public void run() throws Exception {
    _logger.info(String.format("Configuring job for the class %s", getClass().getSimpleName()));

    Job job = Job.getInstance(getConf());
    job.setJarByClass(WordCountJob.class);
    job.setJobName(_name);

    job.setMapperClass(WordCountMapper.class);
    job.setCombinerClass(WordCountCombiner.class);
    job.setReducerClass(WordCountReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(LongWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    String inputPath = _properties.getProperty("input.path");
    String outputPath = _properties.getProperty("output.path");
    boolean forceOverwrite = Boolean.parseBoolean(_properties.getProperty("force.output.overwrite", "false"));

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    // Before we submit the job, remove the old output directory
    if (forceOverwrite) {
        FileSystem fs = FileSystem.get(job.getConfiguration());
        fs.delete(FileOutputFormat.getOutputPath(job), true);
    }

    // Since we have Kerberos enabled at LinkedIn, we must add the token to our configuration. If
    // you don't use Kerberos security for your Hadoop cluster, you don't need this code.
    if (System.getenv("HADOOP_TOKEN_FILE_LOCATION") != null) {
        job.getConfiguration().set("mapreduce.job.credentials.binary",
                System.getenv("HADOOP_TOKEN_FILE_LOCATION"));
    }

    // Submit the job for execution
    _logger.info(String.format("About to submit the job named %s", _name));
    boolean succeeded = job.waitForCompletion(true);

    // Before we return, display our custom counters for the job in the Azkaban logs
    long inputWords = job.getCounters().findCounter(WordCountCounters.INPUT_WORDS).getValue();
    _logger.info(String.format("Read a total of %d input words", inputWords));

    // Azkaban will not realize the Hadoop job failed unless you specifically throw an exception
    if (!succeeded) {
        throw new Exception(String.format("Azkaban job %s failed", _name));
    }
}
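The run() method above is driven entirely by job properties. A hypothetical set of values for the three keys it reads, sketched as Java for consistency with the rest of this page (the paths are placeholders):

Properties props = new Properties();
props.setProperty("input.path", "/data/wordcount/input");   // placeholder path
props.setProperty("output.path", "/data/wordcount/output"); // placeholder path
props.setProperty("force.output.overwrite", "true");        // delete old output first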
From source file: com.linkedin.mr_kluj.GenericClojureJob.java
License: Apache License
public void run() {
    info("Starting " + getClass().getSimpleName());

    /*** Get clojure source ***/
    final String cljSource;
    if (props.getProperty(LI_CLJ_SOURCE) == null) {
        final String resourceName = props.getProperty("li.clj.source.file");
        if (resourceName == null) {
            throw new RuntimeException(
                    "Must define either li.clj.source or li.clj.source.file on the Props object.");
        }

        URL resource = getClass().getClassLoader().getResource(resourceName);

        if (resource == null) {
            // Perhaps it's a URL for a Hadoop-understood file-system
            try {
                resource = getScriptFromPath(new Configuration(), resourceName).toURI().toURL();
            } catch (Exception e) {
                // perhaps it wasn't...
            }
        }

        if (resource == null) {
            // Maybe it's a file
            File theFile = new File(resourceName);
            if (theFile.exists()) {
                try {
                    resource = theFile.toURI().toURL();
                } catch (MalformedURLException e) {
                    throw new RuntimeException("WTF?", e);
                }
            }
        }

        if (resource == null) {
            throw new RuntimeException(
                    String.format("Resource[%s] does not exist on the classpath.", resourceName));
        }

        try {
            cljSource = new String(getBytes(resource.openStream()));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }

        props.setProperty(LI_CLJ_SOURCE, cljSource);
    } else {
        cljSource = props.getProperty(LI_CLJ_SOURCE);
    }

    final String theActualFunction = String.format(
            "(require '[com.linkedin.mr-kluj.job :as job])\n\n" + "%s\n" + "(map job/starter the-jobs)\n",
            cljSource);

    info("--- Source: ---");
    info(theActualFunction);
    info(" --------- ");

    boolean jobCompleted;
    try {
        RT.var("clojure.core", "require").invoke(Symbol.intern("clojure.main"));
        Var.pushThreadBindings(RT.map(RT.var("clojure.core", "*warn-on-reflection*"), RT.T,
                RT.var("user", "*context*"), null, RT.var("user", "*props*"), props));

        Iterable<IFn> jobs = (Iterable<IFn>) clojure.lang.Compiler.load(new StringReader(theActualFunction),
                "start-job-input", "clj-job");

        int count = 0;
        for (IFn ifn : jobs) {
            Job job = (Job) ifn.invoke();
            job.getConfiguration().set(LI_CLJ_SOURCE, cljSource);
            job.getConfiguration().set(LI_CLJ_JOB_INDEX, String.valueOf(count));

            ByteArrayOutputStream baos = new ByteArrayOutputStream(1024 * 10);
            props.storeToXML(baos, null);
            job.getConfiguration().set(LI_CLJ_PROPERTIES, new String(baos.toByteArray()));

            info(String.format("Starting job %s[%s]", job.getJobID(), job.getJobName()));
            jobCompleted = job.waitForCompletion(true);
            ++count;

            if (!jobCompleted) {
                throw new RuntimeException(String.format("Job[%s] failed for some reason.", job.getJobID()));
            }
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
From source file: com.linkedin.pinot.hadoop.io.PinotOutputFormat.java
License: Apache License
public static void setTempSegmentDir(Job job, String segmentDir) {
    job.getConfiguration().set(PinotOutputFormat.TEMP_SEGMENT_DIR, segmentDir);
}
From source file: com.linkedin.pinot.hadoop.io.PinotOutputFormat.java
License: Apache License
public static void setTableName(Job job, String table) {
    job.getConfiguration().set(PinotOutputFormat.TABLE_NAME, table);
}
From source file: com.linkedin.pinot.hadoop.io.PinotOutputFormat.java
License: Apache License
public static void setSegmentName(Job job, String segmentName) {
    job.getConfiguration().set(PinotOutputFormat.SEGMENT_NAME, segmentName);
}
From source file: com.linkedin.pinot.hadoop.io.PinotOutputFormat.java
License: Apache License
public static void setSchema(Job job, Schema schema) {
    job.getConfiguration().set(PinotOutputFormat.SCHEMA, schema.getJSONSchema());
}
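The four PinotOutputFormat setters above all follow the same pattern: stash a driver-side setting in the job's Configuration under a well-known key so the output format can read it back at write time. A sketch of a driver wiring them together; every value below is a hypothetical placeholder, and schema is assumed to be a Pinot Schema instance built elsewhere:

Job job = Job.getInstance(new Configuration());
job.setOutputFormatClass(PinotOutputFormat.class);
PinotOutputFormat.setTempSegmentDir(job, "/tmp/pinot-segments"); // placeholder dir
PinotOutputFormat.setTableName(job, "myTable");                  // placeholder table
PinotOutputFormat.setSegmentName(job, "myTable_2024_01");        // placeholder segment
PinotOutputFormat.setSchema(job, schema);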