Usage examples for org.apache.hadoop.mapreduce.lib.input.NLineInputFormat#getNumLinesPerSplit
public static int getNumLinesPerSplit(JobContext job)
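getNumLinesPerSplit reads the mapreduce.input.lineinputformat.linespermap property from the job configuration and returns 1 when nothing has been set. Before the real-world examples below, a minimal sketch of the set/get round trip (the class name and the value 100 are illustrative only):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;

public class NumLinesPerSplitExample {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "nline-example");

        // Ask for 100 input lines per map task.
        job.setInputFormatClass(NLineInputFormat.class);
        NLineInputFormat.setNumLinesPerSplit(job, 100);

        // Reads back mapreduce.input.lineinputformat.linespermap; prints 100 here, 1 if unset.
        System.out.println("Lines per split: " + NLineInputFormat.getNumLinesPerSplit(job));
    }
}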
From source file: eu.edisonproject.training.tfidf.mapreduce.TermWordFrequency.java
License: Apache License
@Override
public int run(String[] args) throws Exception {
    Configuration jobconf = getConf();
    FileSystem fs = FileSystem.get(jobconf);
    fs.delete(new Path(args[1]), true);

    // Stage the input file into HDFS unless the job runs against a local file system.
    Path in = new Path(args[0]);
    Path inHdfs = in;
    if (!jobconf.get(FileSystem.FS_DEFAULT_NAME_KEY).startsWith("file")) {
        inHdfs = new Path(in.getName());
        fs.delete(inHdfs, true);
        fs.copyFromLocalFile(in, inHdfs);
        fs.deleteOnExit(inHdfs);
        fs.getFileStatus(inHdfs); // throws if the copy did not arrive
    }

    Job job = Job.getInstance(jobconf);

    // Ship the stopword list through the distributed cache.
    Path stopwordsLocal = new Path(args[3]);
    stopwords = new Path(stopwordsLocal.getName());
    fs.delete(stopwords, true);
    fs.copyFromLocalFile(stopwordsLocal, stopwords);
    fs.deleteOnExit(stopwords);
    stopwords = fs.getFileStatus(stopwords).getPath();
    job.addCacheFile(stopwords.toUri());

    // Copy every .txt document into HDFS and cache the directory.
    Path localDocs = new Path(args[2]);
    Path hdfsDocs = new Path(localDocs.getName());
    fs.mkdirs(hdfsDocs);
    hdfsDocs = fs.getFileStatus(hdfsDocs).getPath();
    fs.delete(hdfsDocs, true);
    File[] stats = new File(localDocs.toString()).listFiles();
    for (File stat : stats) {
        Path filePath = new Path(stat.getAbsolutePath());
        if (FilenameUtils.getExtension(filePath.getName()).endsWith("txt")) {
            Path dest = new Path(hdfsDocs.toUri() + "/" + filePath.getName());
            fs.copyFromLocalFile(filePath, dest);
        }
    }
    job.addCacheFile(hdfsDocs.toUri());

    job.setJarByClass(TermWordFrequency.class);
    job.setJobName("Word Frequency Term Driver");

    FileInputFormat.setInputPaths(job, inHdfs);
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    // Feed each mapper args[4] lines at a time and log the configured value.
    job.setInputFormatClass(NLineInputFormat.class);
    NLineInputFormat.addInputPath(job, inHdfs);
    NLineInputFormat.setNumLinesPerSplit(job, Integer.valueOf(args[4]));
    NLineInputFormat.setMaxInputSplitSize(job, 500);
    Logger.getLogger(TermWordFrequency.class.getName()).log(Level.INFO, "Num. of lines: {0}",
            NLineInputFormat.getNumLinesPerSplit(job));

    job.setMapperClass(TermWordFrequencyMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Integer.class);
    job.setReducerClass(TermWordFrequencyReducer.class);

    return job.waitForCompletion(true) ? 0 : 1;
}
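Note that setNumLinesPerSplit simply stores args[4] under mapreduce.input.lineinputformat.linespermap in the job configuration, so the getNumLinesPerSplit call in the log statement reads back exactly the value that was just set.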
From source file: io.apigee.lembos.node.types.NLineInputFormatWrap.java
License: Apache License
/**
 * Java wrapper for {@link NLineInputFormat#getNumLinesPerSplit(org.apache.hadoop.mapreduce.JobContext)}.
 *
 * @param ctx the JavaScript context
 * @param thisObj the 'this' object
 * @param args the function arguments
 * @param func the function being called
 *
 * @return the number of lines per split
 */
@JSStaticFunction
public static Object getNumLinesPerSplit(final Context ctx, final Scriptable thisObj, final Object[] args,
                                         final Function func) {
    final Object arg0 = args.length >= 1 ? args[0] : Undefined.instance;

    if (args.length < 1) {
        throw Utils.makeError(ctx, thisObj, LembosMessages.ONE_ARG_EXPECTED);
    } else if (!JavaScriptUtils.isDefined(arg0)) {
        throw Utils.makeError(ctx, thisObj, LembosMessages.FIRST_ARG_REQUIRED);
    } else if (!(arg0 instanceof JobWrap)) {
        throw Utils.makeError(ctx, thisObj, LembosMessages.FIRST_ARG_MUST_BE_JOB);
    }

    return NLineInputFormat.getNumLinesPerSplit(((JobWrap) arg0).getJob());
}
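Because Rhino hands arguments over as a plain Object[], the wrapper checks that at least one argument was supplied, that it is defined, and that it is a JobWrap before unwrapping it to the underlying org.apache.hadoop.mapreduce.Job; the primitive int result is then auto-boxed into the Object that Rhino marshals back to JavaScript.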
From source file: nl.bioinf.wvanhelvoirt.HadoopPhredCalculator.NReadRecordReader.java
License: Open Source License
/**
 * Initializes the record reader for the given split.
 *
 * @param inputSplit the InputSplit to read
 * @param context    the context for this task
 * @throws IOException          if the split cannot be opened or read
 * @throws InterruptedException if the task is interrupted
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {
    // Initialize.
    Configuration conf = context.getConfiguration();
    FileSplit split = (FileSplit) inputSplit;
    Path file = split.getPath();
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream infile = fs.open(split.getPath());

    // Use the number of lines configured by the user and set the split boundaries.
    this.NLINESTOPROCESS = NLineInputFormat.getNumLinesPerSplit(context);
    this.maxLineLength = conf.getInt("mapreduce.input.linerecordreader.line.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = this.start + split.getLength();
    boolean skipFirstLine = false;

    // Skip the first (partial) line unless this split starts at the beginning of the file.
    if (this.start != 0) {
        skipFirstLine = true;
        this.start--;
        infile.seek(this.start);
    }
    this.in = new LineReader(infile, conf);
    if (skipFirstLine) {
        this.start += this.in.readLine(new Text(), 0,
                (int) Math.min((long) Integer.MAX_VALUE, this.end - this.start));
    }
    this.pos = this.start;
}
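The boundary handling mirrors Hadoop's own LineRecordReader: a split that does not start at byte 0 almost certainly begins mid-line, so the reader discards everything up to and including the first newline and leaves that partial line to the reader of the preceding split. Backing up one byte first ensures that a split beginning exactly at a line start does not throw away a complete line.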
From source file: org.apache.jena.hadoop.rdf.io.input.AbstractNLineFileInputFormat.java
License: Apache License
/**
 * Logically splits the set of input files for the job, treating every
 * N lines of the input as one split.
 *
 * @see FileInputFormat#getSplits(JobContext)
 */
@Override
public final List<InputSplit> getSplits(JobContext job) throws IOException {
    boolean debug = LOGGER.isDebugEnabled();
    if (debug && FileInputFormat.getInputDirRecursive(job)) {
        LOGGER.debug("Recursive searching for input data is enabled");
    }

    List<InputSplit> splits = new ArrayList<InputSplit>();
    int numLinesPerSplit = NLineInputFormat.getNumLinesPerSplit(job);
    for (FileStatus status : listStatus(job)) {
        if (debug) {
            LOGGER.debug("Determining how to split input file/directory {}", status.getPath());
        }
        splits.addAll(NLineInputFormat.getSplitsForFile(status, job.getConfiguration(), numLinesPerSplit));
    }
    return splits;
}
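Here getNumLinesPerSplit is evaluated once, outside the loop, so every input file is split with the same N; NLineInputFormat.getSplitsForFile then produces one split per N lines, yielding roughly ceil(lines / N) map tasks per file.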