Usage examples for org.apache.hadoop.mapreduce.Counter.getValue()
long getValue();
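Every example below follows the same basic pattern: run a MapReduce job to completion, fetch its aggregated Counters, look up a specific Counter, and read its total with getValue(). Here is a minimal sketch of that pattern, assuming a hypothetical user-defined counter enum MyCounters and a placeholder job configuration (neither is taken from the examples below):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Counters;
import org.apache.hadoop.mapreduce.Job;

public class CounterValueExample {

    // Hypothetical user-defined counter key; any enum works as a counter
    // key in the mapreduce API (tasks increment it via context.getCounter(...)).
    enum MyCounters { RECORDS_PROCESSED }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "counter example");
        // ... set the jar, mapper, reducer, and input/output paths here ...
        job.waitForCompletion(true);

        // Counter values are only final once the job has completed.
        Counters counters = job.getCounters();
        Counter counter = counters.findCounter(MyCounters.RECORDS_PROCESSED);
        long total = counter.getValue(); // aggregated total across all tasks
        System.out.println("Records processed: " + total);
    }
}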
From source file:simsql.runtime.MRLoader.java
License:Apache License
public long run(String inputPath, String outputPath, short typeCode, Relation r, int sortAtt) {

    // make a directory for the relation
    Configuration conf = new Configuration();
    FileSystem dfs = null;

    try {
        dfs = FileSystem.get(conf);
    } catch (Exception e) {
        throw new RuntimeException("Cannot access HDFS!", e);
    }

    try {
        // if it exists, destroy it.
        Path path = new Path(outputPath);
        if (dfs.exists(path)) {
            dfs.delete(path, true);
        }
    } catch (Exception e) {
        throw new RuntimeException("Could not create the file to bulk load to!", e);
    }

    // find a file name
    String tempPath = null;
    if (inputPath.startsWith("hdfs:")) {
        tempPath = inputPath.replace("hdfs:", "");
    } else {
        tempPath = "/tempDataFile_" + r.getName();
        try {
            dfs.delete(new Path(tempPath), true);
        } catch (Exception e) {
            // ignore this.
        }

        // upload the text file
        try {
            dfs.copyFromLocalFile(false, true, new Path(inputPath), new Path(tempPath));
            dfs.deleteOnExit(new Path(tempPath));
        } catch (Exception e) {
            throw new RuntimeException("Failed to upload text file " + inputPath + " to HDFS!", e);
        }
    }

    // set up the new job's parameters.
    conf.setBoolean("mapred.compress.map.output", true);
    conf.set("mapred.map.output.compression.codec", RecordCompression.getCodecClass());
    conf.set("io.serializations",
            "simsql.runtime.RecordSerialization,simsql.runtime.RecordKeySerialization,org.apache.hadoop.io.serializer.WritableSerialization");
    conf.setInt("simsql.loader.numAtts", r.getAttributes().size());
    conf.setInt("simsql.loader.typeCode", (int) typeCode);
    conf.setInt("simsql.loader.sortAtt", sortAtt);

    String[] myStrings = new String[r.getAttributes().size()];
    int j = 0;
    for (simsql.compiler.Attribute a : r.getAttributes()) {
        myStrings[j++] = a.getPhysicalRealization().getClass().getName();
    }
    conf.setStrings("simsql.loader.types", myStrings);

    // create a job
    Job job;
    try {
        job = new Job(conf);
    } catch (Exception e) {
        throw new RuntimeException("Unable to create bulk loading job!", e);
    }

    // set the split size (number of mappers)
    long fSize = 0;
    if (inputPath.startsWith("hdfs")) {
        fSize = RelOp.getPathsTotalSize(new String[] { tempPath });
    } else {
        fSize = new File(inputPath).length();
    }
    FileInputFormat.setMinInputSplitSize(job, fSize / (long) numTasks);
    FileInputFormat.setMaxInputSplitSize(job, fSize / (long) numTasks);

    // and the number of reducers
    job.setNumReduceTasks(numTasks);

    // the mapper/reducer/jar
    job.setMapperClass(MRLoaderMapper.class);
    job.setReducerClass(MRLoaderReducer.class);
    job.setJarByClass(MRLoader.class);

    // I/O settings.
    job.setOutputFormatClass(RecordOutputFormat.class);
    job.setMapOutputKeyClass(RecordKey.class);
    job.setMapOutputValueClass(RecordWrapper.class);
    job.setOutputKeyClass(Nothing.class);
    job.setOutputValueClass(Record.class);

    try {
        FileInputFormat.setInputPaths(job, new Path(tempPath));
        FileOutputFormat.setOutputPath(job, new Path(outputPath));
    } catch (Exception e) {
        throw new RuntimeException("Could not set job inputs/outputs", e);
    }

    job.setGroupingComparatorClass(RecordKeyGroupingComparator.class);
    job.setPartitionerClass(RecordPartitioner.class);
    job.setSortComparatorClass(RecordKeySortComparator.class);
    job.setJobName("MRLoader: " + inputPath + " ==> " + outputPath);

    // run it
    Counters counters;
    try {
        job.waitForCompletion(true);
        counters = job.getCounters();
    } catch (Exception e) {
        throw new RuntimeException("Could not set up bulk loader job!", e);
    }

    // now, delete all the empty part files
    try {
        // get a filesystem
        FileSystem ddfs = FileSystem.get(conf);
        Path outPath = new Path(outputPath);
        if (ddfs.exists(outPath) && ddfs.isDirectory(outPath)) {
            FileStatus fstatus[] = ddfs.listStatus(outPath, new TableFileFilter());
            for (FileStatus ff : fstatus) {
                if (ddfs.getContentSummary(ff.getPath()).getLength() <= 4) { // snappy leaves 4-byte long files around...
                    ddfs.delete(ff.getPath(), true);
                }
            }
        }
    } catch (Exception e) {
        // this isn't disastrous
    }

    // get the counter for the output of the mapper.
    Counter bytesCounter = counters.findCounter(OutputFileSerializer.Counters.BYTES_WRITTEN);
    return bytesCounter.getValue();
}
From source file:simsql.runtime.RelOp.java
License:Apache License
public boolean run(RuntimeParameter params, boolean verbose) {
    ExampleRuntimeParameter pp = (ExampleRuntimeParameter) params;

    // build the jar.
    String jarFile = buildJarFile(params);

    // Get the default configuration object
    Configuration conf = new Configuration();

    // set quiet mode on/off
    conf.setQuietMode(!verbose);

    /***
    conf.setBoolean("mapred.task.profile", true);
    conf.set("mapred.task.profile.params", "-agentlib:hprof=cpu=samples," +
        "heap=sites,depth=8,force=n,thread=y,verbose=n,file=%s");
    ***/

    // tell it how to serialize and deserialize records and recordkeys
    conf.set("io.serializations", getSerializations());
    conf.setBoolean("mapred.compress.map.output", true);

    int ioSortMB = conf.getInt("io.sort.mb", 256);
    conf.set("mapred.map.child.java.opts",
            "-Xmx" + (getMemPerMapper(params) + ioSortMB) + "m -Xms" + (getMemPerMapper(params))
                    + "m -Duser.timezone='America/Chicago' -Djava.net.preferIPv4Stack=true -XX:CompileThreshold=10000 -XX:+DoEscapeAnalysis -XX:+UseNUMA -XX:-EliminateLocks -XX:+UseBiasedLocking -XX:+OptimizeStringConcat -XX:+UseFastAccessorMethods -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycleMin=0 -XX:+UseCompressedOops -XX:+AggressiveOpts -XX:-UseStringCache -XX:ErrorFile=/tmp/hs_err_pid%p.log");
    conf.set("mapred.reduce.child.java.opts",
            "-Xmx" + (getMemPerReducer(params) + ioSortMB) + "m -Xms" + (getMemPerMapper(params))
                    + "m -Duser.timezone='America/Chicago' -Djava.net.preferIPv4Stack=true -XX:CompileThreshold=10000 -XX:+DoEscapeAnalysis -XX:+UseNUMA -XX:-EliminateLocks -XX:+UseBiasedLocking -XX:+OptimizeStringConcat -XX:+UseFastAccessorMethods -XX:+UseConcMarkSweepGC -XX:+CMSIncrementalMode -XX:+CMSIncrementalPacing -XX:CMSIncrementalDutyCycleMin=0 -XX:+UseCompressedOops -XX:+AggressiveOpts -XX:-UseStringCache -XX:ErrorFile=/tmp/hs_err_pid%p.log");

    conf.setInt("simsql.input.numSplits", pp.getNumCPUs());
    conf.setInt("mapred.job.reuse.jvm.num.tasks", 1);
    // conf.setBoolean ("mapred.map.tasks.speculative.execution", false);
    // conf.setBoolean ("mapred.reduce.tasks.speculative.execution", false);

    // tell it to use the jar that we just created
    conf.set("mapred.jar", jarFile);

    // conf.set("tmpjars", "file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-client-core.jar");

    conf.setBoolean("mapred.output.compress", true);
    conf.setStrings("mapred.output.compression.type", new String[] { "RECORD" });

    // use snappy for the intermediate stuff
    conf.set("mapred.map.output.compression.codec", RecordCompression.getCodecClass());

    // do some additional operator-specific configurations
    setConfigurations(conf, params);

    // collect statistics for final relations always
    conf.setBoolean("simsql.collectStats", isFinal || collectStats);

    // figure out what file to map
    String[] inDirs = myInputNetwork.getPipelinedInputFiles();
    inDirs = excludeAnyWhoWillNotBeMapped(inDirs);
    String inSingleString = inDirs[0];
    conf.set("simsql.fileToMap", inSingleString);
    for (int i = 1; i < inDirs.length; i++) {
        inSingleString += "," + inDirs[i];
    }

    // create and name the job
    Job job;
    try {
        job = new Job(conf);
    } catch (Exception e) {
        throw new RuntimeException("Unable to create a new job!", e);
    }
    job.setJobName(getJobName());

    // set the map-reduce input and output types
    job.setMapOutputKeyClass(getMapOutputKeyClass());
    job.setMapOutputValueClass(getMapOutputValueClass());
    job.setOutputKeyClass(getOutputKeyClass());
    job.setOutputValueClass(getOutputValueClass());

    int numReducers = getNumReducers(params);

    job.setMapperClass(getMapperClass());
    job.setReducerClass(getReducerClass());

    // set the number of reducers
    job.setNumReduceTasks(numReducers);

    // set the input and the output formats... these extend FileInputFormat and FileOutputFormat
    job.setInputFormatClass(getInputFormatClass());
    job.setOutputFormatClass(getOutputFormatClass());

    // set the input and output paths
    try {
        System.out.println("input file: " + inSingleString);
        FileInputFormat.setInputPaths(job, inSingleString);
        FileInputFormat.setInputPathFilter(job, TableFileFilter.class);
        FileOutputFormat.setOutputPath(job, new Path(getOutput()));
    } catch (Exception e) {
        throw new RuntimeException("Unable to set up the input/output path for the job.", e);
    }

    // set the split size
    FileInputFormat.setMinInputSplitSize(job, getSplitSize(params));
    FileInputFormat.setMaxInputSplitSize(job, getSplitSize(params));

    // set the various sorting/grouping/mapping classes
    job.setGroupingComparatorClass(getGroupingComparatorClass());
    job.setPartitionerClass(getPartitionerClass());
    job.setSortComparatorClass(getSortComparatorClass());

    // and now, submit the job and wait for things to finish
    int exitCode;
    try {
        exitCode = job.waitForCompletion(verbose) ? 0 : 1;

        // get the output bytes counter.
        Counters c = job.getCounters();
        Counter mx = c.findCounter(OutputFileSerializer.Counters.BYTES_WRITTEN);

        // and use them to set the size of the output relation.
        if (myDB != null) {
            myDB.setTableSize(myDB.getTableName(getOutput()), mx.getValue());
            myDB.setNumAtts(myDB.getTableName(getOutput()), getOutputAttNames().length);
        }
    } catch (Exception e) {
        e.printStackTrace();
        throw new RuntimeException("Unable to run the job", e);
    }

    // now, delete all the empty part files
    try {
        // get a filesystem
        FileSystem dfs = FileSystem.get(conf);
        Path outPath = new Path(getOutput());
        if (dfs.exists(outPath) && dfs.isDirectory(outPath)) {
            FileStatus fstatus[] = dfs.listStatus(outPath, new TableFileFilter());
            for (FileStatus ff : fstatus) {
                if (dfs.getContentSummary(ff.getPath()).getLength() <= 4) { // snappy leaves 4-byte long files around...
                    dfs.delete(ff.getPath(), true);
                }
            }
        }
    } catch (Exception e) {
        // this isn't disastrous
    }

    return (exitCode == 0);
}
From source file:sixdegrees.LevelData.java
License:Apache License
public void execute(String args[]) throws IOException, ClassNotFoundException, InterruptedException {
    Configuration conf = new Configuration();
    String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (otherArgs.length != 2) {
        System.err.println("Usage: Level Data <in> <out>");
        System.exit(2);
    }

    Job job = new Job(conf, "level data");
    job.setJarByClass(LevelData.class);
    job.setMapperClass(LevelMapper.class);
    // No Combiner
    job.setReducerClass(LevelReducer.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);
    job.setNumReduceTasks(6);
    FileInputFormat.addInputPath(job, new Path(args[1] + "6"));
    FileOutputFormat.setOutputPath(job, new Path(args[1] + "final"));
    job.waitForCompletion(true);

    Counters counters = job.getCounters();
    Counter c1 = counters.findCounter(NODE_COUNTER.TOTAL_NODES);
    int totalNodes = (int) c1.getValue();
    Counter c2 = counters.findCounter(NODE_COUNTER.DEGREE_COUNT);
    int totalDegrees = (int) c2.getValue();
    System.out.println("Average: " + (float) totalDegrees / (totalNodes - 1));
}
From source file:uk.gov.gchq.gaffer.accumulostore.operation.hdfs.handler.job.tool.SampleDataAndCreateSplitsFileTool.java
License:Apache License
@Override
public int run(final String[] strings) throws OperationException {
    try {
        LOGGER.info("Creating job using SampleDataForSplitPointsJobFactory");
        job = new SampleDataForSplitPointsJobFactory().createJob(operation, store);
    } catch (final IOException e) {
        LOGGER.error("Failed to create Hadoop job: {}", e.getMessage());
        throw new OperationException("Failed to create the Hadoop job: " + e.getMessage(), e);
    }

    try {
        LOGGER.info("Running SampleDataForSplitPoints job (job name is {})", job.getJobName());
        job.waitForCompletion(true);
    } catch (final IOException | InterruptedException | ClassNotFoundException e) {
        LOGGER.error("Exception running job: {}", e.getMessage());
        throw new OperationException("Error while waiting for job to complete: " + e.getMessage(), e);
    }

    try {
        if (!job.isSuccessful()) {
            LOGGER.error("Job was not successful (job name is {})", job.getJobName());
            throw new OperationException("Error running job");
        }
    } catch (final IOException e) {
        LOGGER.error("Exception running job: {}", e.getMessage());
        throw new OperationException("Error running job" + e.getMessage(), e);
    }

    // Find the number of records output
    // NB In the following line use mapred.Task.Counter.REDUCE_OUTPUT_RECORDS rather than
    // mapreduce.TaskCounter.REDUCE_OUTPUT_RECORDS as this is more compatible with earlier
    // versions of Hadoop.
    Counter counter;
    try {
        counter = job.getCounters().findCounter(Task.Counter.REDUCE_OUTPUT_RECORDS);
        LOGGER.info("Number of records output = {}", counter);
    } catch (final IOException e) {
        LOGGER.error(
                "Failed to get counter org.apache.hadoop.mapred.Task.Counter.REDUCE_OUTPUT_RECORDS from job: {}",
                e.getMessage());
        throw new OperationException("Failed to get counter: " + Task.Counter.REDUCE_OUTPUT_RECORDS, e);
    }

    int numberTabletServers;
    try {
        numberTabletServers = store.getConnection().instanceOperations().getTabletServers().size();
        LOGGER.info("Number of tablet servers is {}", numberTabletServers);
    } catch (final StoreException e) {
        LOGGER.error("Exception thrown getting number of tablet servers: {}", e.getMessage());
        throw new OperationException(e.getMessage(), e);
    }

    long outputEveryNthRecord = counter.getValue() / (numberTabletServers - 1);
    final Path resultsFile = new Path(operation.getOutputPath(), "part-r-00000");
    LOGGER.info("Will output every {}-th record from {}", outputEveryNthRecord, resultsFile);

    // Read through the resulting file, pick out the split points and write them to file.
    final Configuration conf = getConf();
    final FileSystem fs;
    try {
        fs = FileSystem.get(conf);
    } catch (final IOException e) {
        LOGGER.error("Exception getting filesystem: {}", e.getMessage());
        throw new OperationException("Failed to get filesystem from configuration: " + e.getMessage(), e);
    }

    LOGGER.info("Writing splits to {}", operation.getResultingSplitsFilePath());
    final Key key = new Key();
    final Value value = new Value();
    long count = 0;
    int numberSplitPointsOutput = 0;
    try (final SequenceFile.Reader reader = new SequenceFile.Reader(fs, resultsFile, conf);
            final PrintStream splitsWriter = new PrintStream(
                    new BufferedOutputStream(fs.create(new Path(operation.getResultingSplitsFilePath()), true)),
                    false, CommonConstants.UTF_8)) {
        while (reader.next(key, value) && numberSplitPointsOutput < numberTabletServers - 1) {
            count++;
            if (count % outputEveryNthRecord == 0) {
                LOGGER.debug("Outputting split point number {} ({})", numberSplitPointsOutput,
                        Base64.encodeBase64(key.getRow().getBytes()));
                numberSplitPointsOutput++;
                splitsWriter.println(
                        new String(Base64.encodeBase64(key.getRow().getBytes()), CommonConstants.UTF_8));
            }
        }
        LOGGER.info("Total number of records read was {}", count);
    } catch (final IOException e) {
        LOGGER.error("Exception reading results file and outputting split points: {}", e.getMessage());
        throw new OperationException(e.getMessage(), e);
    }

    try {
        fs.delete(resultsFile, true);
        LOGGER.info("Deleted the results file {}", resultsFile);
    } catch (final IOException e) {
        LOGGER.error("Failed to delete the results file {}", resultsFile);
        throw new OperationException("Failed to delete the results file: " + e.getMessage(), e);
    }

    return SUCCESS_RESPONSE;
}