Example usage for org.apache.hadoop.mapreduce.lib.input NLineInputFormat getNumLinesPerSplit

Introduction

On this page you can find example usages of org.apache.hadoop.mapreduce.lib.input NLineInputFormat getNumLinesPerSplit.

Prototype

public static int getNumLinesPerSplit(JobContext job) 

Document

Get the number of lines per split.
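
The method reads the "mapreduce.input.lineinputformat.linespermap" property from the job configuration and returns 1 when the property is unset. Below is a minimal sketch of the set/get round trip; the class name and job name are hypothetical, chosen only for illustration:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat;

public class NumLinesPerSplitDemo {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "n-line-demo");

        // Property not set yet: getNumLinesPerSplit falls back to its default of 1.
        System.out.println(NLineInputFormat.getNumLinesPerSplit(job)); // prints 1

        // setNumLinesPerSplit writes the property; getNumLinesPerSplit reads it back.
        NLineInputFormat.setNumLinesPerSplit(job, 1000);
        System.out.println(NLineInputFormat.getNumLinesPerSplit(job)); // prints 1000
    }
}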

Usage

From source file:eu.edisonproject.training.tfidf.mapreduce.TermWordFrequency.java

License:Apache License

@Override
public int run(String[] args) throws Exception {
    Configuration jobconf = getConf();

    FileSystem fs = FileSystem.get(jobconf);
    fs.delete(new Path(args[1]), true);
    Path in = new Path(args[0]);
    Path inHdfs = in;
    if (!jobconf.get(FileSystem.FS_DEFAULT_NAME_KEY).startsWith("file")) {
        inHdfs = new Path(in.getName());
        fs.delete(inHdfs, true);
        fs.copyFromLocalFile(in, inHdfs);
        fs.deleteOnExit(inHdfs);
        FileStatus inHdfsStatus = fs.getFileStatus(inHdfs);
        //            Logger.getLogger(TermWordFrequency.class.getName()).log(Level.INFO, "Copied: {0} to: {1}", new Object[]{in.toUri(), inHdfsStatus.getPath().toUri()});
    }

    Job job = Job.getInstance(jobconf);
    Path stopwordsLocal = new Path(args[3]);
    stopwords = new Path(stopwordsLocal.getName());
    fs.delete(stopwords, true);
    fs.copyFromLocalFile(stopwordsLocal, stopwords);
    fs.deleteOnExit(stopwords);

    FileStatus stopwordsStatus = fs.getFileStatus(stopwords);
    stopwords = stopwordsStatus.getPath();
    job.addCacheFile(stopwords.toUri());

    Path localDocs = new Path(args[2]);
    Path hdfsDocs = new Path(localDocs.getName());
    fs.mkdirs(hdfsDocs);
    hdfsDocs = fs.getFileStatus(hdfsDocs).getPath();
    fs.delete(hdfsDocs, true);
    //        FileStatus[] stats = fs.listStatus(localDocs);
    File[] stats = new File(localDocs.toString()).listFiles();

    for (File stat : stats) {
        //        for (FileStatus stat : stats) {
        Path filePath = new Path(stat.getAbsolutePath());
        if (FilenameUtils.getExtension(filePath.getName()).endsWith("txt")) {
            Path dest = new Path(hdfsDocs.toUri() + "/" + filePath.getName());
            fs.copyFromLocalFile(filePath, dest);
        }
    }

    job.addCacheFile(hdfsDocs.toUri());

    job.setJarByClass(TermWordFrequency.class);
    job.setJobName("Word Frequency Term Driver");

    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setInputFormatClass(NLineInputFormat.class);
    // Register the input path once; calling both FileInputFormat.setInputPaths
    // and NLineInputFormat.addInputPath would list inHdfs twice.
    NLineInputFormat.addInputPath(job, inHdfs);
    NLineInputFormat.setNumLinesPerSplit(job, Integer.valueOf(args[4]));
    NLineInputFormat.setMaxInputSplitSize(job, 500);
    Logger.getLogger(TermWordFrequency.class.getName()).log(Level.INFO, "Num. of lines: {0}",
            NLineInputFormat.getNumLinesPerSplit(job));

    job.setMapperClass(TermWordFrequencyMapper.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class); // Integer is not a Writable; use IntWritable
    job.setReducerClass(TermWordFrequencyReducer.class);

    return (job.waitForCompletion(true) ? 0 : 1);

}

From source file:io.apigee.lembos.node.types.NLineInputFormatWrap.java

License:Apache License

/**
 * Java wrapper for {@link NLineInputFormat#getNumLinesPerSplit(org.apache.hadoop.mapreduce.JobContext)}.
 *
 * @param ctx the JavaScript context
 * @param thisObj the 'this' object
 * @param args the function arguments
 * @param func the function being called
 *
 * @return the number of lines per split
 */
@JSStaticFunction
public static Object getNumLinesPerSplit(final Context ctx, final Scriptable thisObj, final Object[] args,
        final Function func) {
    final Object arg0 = args.length >= 1 ? args[0] : Undefined.instance;

    if (args.length < 1) {
        throw Utils.makeError(ctx, thisObj, LembosMessages.ONE_ARG_EXPECTED);
    } else if (!JavaScriptUtils.isDefined(arg0)) {
        throw Utils.makeError(ctx, thisObj, LembosMessages.FIRST_ARG_REQUIRED);
    } else if (!(arg0 instanceof JobWrap)) {
        throw Utils.makeError(ctx, thisObj, LembosMessages.FIRST_ARG_MUST_BE_JOB);
    }

    return NLineInputFormat.getNumLinesPerSplit(((JobWrap) arg0).getJob());
}

From source file:nl.bioinf.wvanhelvoirt.HadoopPhredCalculator.NReadRecordReader.java

License:Open Source License

/**
 * Initializes the record reader for the given split.
 *
 * @param inputSplit The InputSplit to read.
 * @param context    The context for this task.
 * @throws IOException          If an I/O error occurs while reading the split.
 * @throws InterruptedException If the task is interrupted.
 */
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext context)
        throws IOException, InterruptedException {

    // Initialize.
    Configuration conf = context.getConfiguration();
    FileSplit split = (FileSplit) inputSplit;
    Path file = split.getPath();
    FileSystem fs = file.getFileSystem(conf);
    FSDataInputStream infile = fs.open(split.getPath());

    // Use number of lines given by user and set parameters.
    this.NLINESTOPROCESS = NLineInputFormat.getNumLinesPerSplit(context);
    this.maxLineLength = conf.getInt("mapreduce.input.linerecordreader.line.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = this.start + split.getLength();
    boolean skipFirstLine = false;

    // Skip first line?
    if (this.start != 0) {
        skipFirstLine = true;
        this.start--;
        infile.seek(this.start);
    }
    this.in = new LineReader(infile, conf);
    if (skipFirstLine) {
        this.start += this.in.readLine(new Text(), 0,
                (int) Math.min((long) Integer.MAX_VALUE, this.end - this.start));
    }
    this.pos = this.start;
}

From source file:org.apache.jena.hadoop.rdf.io.input.AbstractNLineFileInputFormat.java

License:Apache License

/**
 * Logically splits the set of input files for the job, treating every N
 * lines of the input as one split.
 * 
 * @see FileInputFormat#getSplits(JobContext)
 */
@Override
public final List<InputSplit> getSplits(JobContext job) throws IOException {
    boolean debug = LOGGER.isDebugEnabled();
    if (debug && FileInputFormat.getInputDirRecursive(job)) {
        LOGGER.debug("Recursive searching for input data is enabled");
    }

    List<InputSplit> splits = new ArrayList<InputSplit>();
    int numLinesPerSplit = NLineInputFormat.getNumLinesPerSplit(job);
    for (FileStatus status : listStatus(job)) {
        if (debug) {
            LOGGER.debug("Determining how to split input file/directory {}", status.getPath());
        }
        splits.addAll(NLineInputFormat.getSplitsForFile(status, job.getConfiguration(), numLinesPerSplit));
    }
    return splits;
}