Example usage for org.apache.hadoop.fs LocatedFileStatus isFile

List of usage examples for org.apache.hadoop.fs LocatedFileStatus isFile

Introduction

In this page you can find the example usage for org.apache.hadoop.fs LocatedFileStatus isFile.

Prototype

public boolean isFile() 

Source Link

Document

Is this a file?

Usage

From source file:ch.cern.db.hdfs.DistributedFileSystemMetadata.java

License:GNU General Public License

public LinkedList<BlockLocation> getBlockLocations(Path path) throws IOException {
    LOG.info("Collecting block locations...");

    LinkedList<BlockLocation> blockLocations = new LinkedList<BlockLocation>();
    RemoteIterator<LocatedFileStatus> statuses = listFiles(path, true);
    int hasNextCode = hasNextCode(statuses);
    while (hasNextCode > 0) {
        if (hasNextCode > 1) {
            hasNextCode = hasNextCode(statuses);
            continue;
        }/*w  w  w . j  av a 2  s.co  m*/

        LocatedFileStatus fileStatus = statuses.next();

        if (fileStatus.isFile()) {
            BlockLocation[] blockLocations_tmp = getFileBlockLocations(fileStatus, 0, fileStatus.getLen());

            blockLocations.addAll(Arrays.asList(blockLocations_tmp));
        }

        int size = blockLocations.size();
        if (size > 0 && size % 5000 == 0)
            LOG.info("Collected " + size + " locations. Still in progress...");

        if (size >= MAX_NUMBER_OF_LOCATIONS) {
            LOG.info("Reached max number of locations to collect. The amount will be representative enough.");
            break;
        }

        hasNextCode = hasNextCode(statuses);
    }
    LOG.info("Collected " + blockLocations.size() + " locations.");

    if (isHdfsBlocksMetadataEnabled()) {
        BlockStorageLocation[] blockStorageLocations = getFileBlockStorageLocations(blockLocations);

        blockLocations.clear();
        blockLocations.addAll(Arrays.asList(blockStorageLocations));
    } else {
        LOG.error("VolumnId/DiskId can not be collected since "
                + "dfs.datanode.hdfs-blocks-metadata.enabled is not enabled.");
    }

    return blockLocations;
}

From source file:com.awcoleman.StandaloneJava.AvroCombinerByBlock.java

License:Apache License

public AvroCombinerByBlock(String inDirStr, String outDirStr, String handleExisting) throws IOException {

    //handle both an output directory and an output filename (ending with .avro)
    String outputFilename = DEFAULTOUTPUTFILENAME;
    if (outDirStr.endsWith(".avro")) {
        isOutputNameSpecifiedAndAFile = true;
        //String[] outputParts = outDirStr.split(":?\\\\");
        String[] outputParts = outDirStr.split("/");

        outputFilename = outputParts[outputParts.length - 1];

        //remove outputFilename from outDirStr to get new outDirStr which is just directory (and trailing /)
        outDirStr = outDirStr.replaceAll(Pattern.quote(outputFilename), "");
        outDirStr = outDirStr.substring(0, outDirStr.length() - (outDirStr.endsWith("/") ? 1 : 0));
    }//from ww  w . j a v a2s .  co  m

    //Get block size - not needed
    //long hdfsBlockSize = getBlockSize();
    //System.out.println("HDFS FS block size: "+hdfsBlockSize);

    //Get list of input files
    ArrayList<FileStatus> inputFileList = new ArrayList<FileStatus>();

    Configuration conf = new Configuration();
    conf.addResource(new Path("/etc/hadoop/conf/core-site.xml"));
    conf.set("dfs.replication", "1"); //see http://stackoverflow.com/questions/24548699/how-to-append-to-an-hdfs-file-on-an-extremely-small-cluster-3-nodes-or-less

    FileSystem hdfs = null;
    try {
        hdfs = FileSystem.get(conf);
    } catch (java.io.IOException ioe) {
        System.out.println("Error opening HDFS filesystem. Exiting. Error message: " + ioe.getMessage());
        System.exit(1);
    }
    if (hdfs.getStatus() == null) {
        System.out.println("Unable to contact HDFS filesystem. Exiting.");
        System.exit(1);
    }

    //Check if input and output dirs exist
    Path inDir = new Path(inDirStr);
    Path outDir = new Path(outDirStr);
    if (!(hdfs.exists(inDir) || hdfs.isDirectory(inDir))) {
        System.out.println("Input directory ( " + inDirStr + " ) not found or is not directory. Exiting.");
        System.exit(1);
    }

    if (!(hdfs.exists(outDir) || hdfs.isDirectory(outDir))) {
        if (hdfs.exists(outDir)) { //outDir exists and is a symlink or file, must die
            System.out.println("Requested output directory name ( " + outDirStr
                    + " ) exists but is not a directory. Exiting.");
            System.exit(1);
        } else {
            hdfs.mkdirs(outDir);
        }
    }

    RemoteIterator<LocatedFileStatus> fileStatusListIterator = hdfs.listFiles(inDir, true);
    while (fileStatusListIterator.hasNext()) {
        LocatedFileStatus fileStatus = fileStatusListIterator.next();

        if (fileStatus.isFile() && !fileStatus.getPath().getName().equals("_SUCCESS")) {
            inputFileList.add((FileStatus) fileStatus);
        }
    }

    if (inputFileList.size() <= 1 && !isOutputNameSpecifiedAndAFile) { //If an output file is specified assume we just want a rename.
        System.out.println("Only one or zero files found in input directory ( " + inDirStr + " ). Exiting.");
        System.exit(1);
    }

    //Get Schema and Compression Codec from seed file since we need it for the writer
    Path firstFile = inputFileList.get(0).getPath();
    FsInput fsin = new FsInput(firstFile, conf);
    DataFileReader<Object> dfrFirstFile = new DataFileReader<Object>(fsin, new GenericDatumReader<Object>());
    Schema fileSchema = dfrFirstFile.getSchema();
    String compCodecName = dfrFirstFile.getMetaString("avro.codec");
    //compCodecName should be null, deflate, snappy, or bzip2
    if (compCodecName == null) {
        compCodecName = "deflate"; //set to deflate even though original is no compression
    }
    dfrFirstFile.close();

    //Create Empty HDFS file in output dir
    String seedFileStr = outDirStr + "/" + outputFilename;
    Path seedFile = new Path(seedFileStr);
    FSDataOutputStream hdfsdos = null;
    try {
        hdfsdos = hdfs.create(seedFile, false);
    } catch (org.apache.hadoop.fs.FileAlreadyExistsException faee) {
        if (handleExisting.equals("overwrite")) {
            hdfs.delete(seedFile, false);
            hdfsdos = hdfs.create(seedFile, false);
        } else if (handleExisting.equals("append")) {
            hdfsdos = hdfs.append(seedFile);
        } else {
            System.out
                    .println("File " + seedFileStr + " exists and will not overwrite. handleExisting is set to "
                            + handleExisting + ". Exiting.");
            System.exit(1);
        }
    }
    if (hdfsdos == null) {
        System.out.println("Unable to create or write to output file ( " + seedFileStr
                + " ). handleExisting is set to " + handleExisting + ". Exiting.");
        System.exit(1);
    }

    //Append other files
    GenericDatumWriter gdw = new GenericDatumWriter(fileSchema);
    DataFileWriter dfwBase = new DataFileWriter(gdw);
    //Set compression to that found in the first file
    dfwBase.setCodec(CodecFactory.fromString(compCodecName));

    DataFileWriter dfw = dfwBase.create(fileSchema, hdfsdos);
    for (FileStatus thisFileStatus : inputFileList) {

        //_SUCCESS files are 0 bytes
        if (thisFileStatus.getLen() == 0) {
            continue;
        }

        FsInput fsin1 = new FsInput(thisFileStatus.getPath(), conf);
        DataFileReader dfr = new DataFileReader<Object>(fsin1, new GenericDatumReader<Object>());

        dfw.appendAllFrom(dfr, false);

        dfr.close();
    }

    dfw.close();
    dfwBase.close();

}

From source file:com.awcoleman.StandaloneJava.AvroCombinerByRecord.java

License:Apache License

public AvroCombinerByRecord(String inDirStr, String outDirStr) throws IOException {

    //Get list of input files
    ArrayList<FileStatus> inputFileList = new ArrayList<FileStatus>();

    Configuration conf = new Configuration();
    conf.addResource(new Path("/etc/hadoop/conf/core-site.xml"));

    FileSystem hdfs = FileSystem.get(conf);

    //Check if input and output dirs exist
    Path inDir = new Path(inDirStr);
    Path outDir = new Path(outDirStr);
    if (!(hdfs.exists(inDir) || hdfs.isDirectory(inDir))) {
        System.out.println("Input directory ( " + inDirStr + " ) not found or is not directory. Exiting.");
        System.exit(1);/*  w w  w . ja  v a  2s.  co m*/
    }

    if (!(hdfs.exists(outDir) || hdfs.isDirectory(outDir))) {
        if (hdfs.exists(outDir)) { //outDir exists and is a symlink or file, must die
            System.out.println("Requested output directory name ( " + outDirStr
                    + " ) exists but is not a directory. Exiting.");
            System.exit(1);
        } else {
            hdfs.mkdirs(outDir);
        }
    }

    RemoteIterator<LocatedFileStatus> fileStatusListIterator = hdfs.listFiles(inDir, true);
    while (fileStatusListIterator.hasNext()) {
        LocatedFileStatus fileStatus = fileStatusListIterator.next();

        if (fileStatus.isFile()) {
            inputFileList.add((FileStatus) fileStatus);
        }
    }

    if (inputFileList.size() <= 1) {
        System.out.println("Only one or zero files found in input directory ( " + inDirStr + " ). Exiting.");
        System.exit(1);
    }

    //Get Schema and Compression Codec from seed file since we need it for the writer
    Path firstFile = inputFileList.get(0).getPath();
    FsInput fsin = new FsInput(firstFile, conf);
    DataFileReader<Object> dfrFirstFile = new DataFileReader<Object>(fsin, new GenericDatumReader<Object>());
    Schema fileSchema = dfrFirstFile.getSchema();
    String compCodecName = dfrFirstFile.getMetaString("avro.codec");
    dfrFirstFile.close();

    //Create Empty HDFS file in output dir
    Path seedFile = new Path(outDirStr + "/combinedByRecord.avro");
    FSDataOutputStream hdfsdos = hdfs.create(seedFile, false);

    //Append other files
    GenericDatumWriter gdw = new GenericDatumWriter(fileSchema);
    DataFileWriter dfwBase = new DataFileWriter(gdw);
    //Set compression to that found in the first file
    dfwBase.setCodec(CodecFactory.fromString(compCodecName));

    DataFileWriter dfw = dfwBase.create(fileSchema, hdfsdos);

    for (FileStatus thisFileStatus : inputFileList) {

        DataFileStream<Object> avroStream = null;
        FSDataInputStream inStream = hdfs.open(thisFileStatus.getPath());
        GenericDatumReader<Object> reader = new GenericDatumReader<Object>();
        avroStream = new DataFileStream<Object>(inStream, reader);

        long recordCounter = 0;
        while (avroStream.hasNext()) {
            dfw.append(avroStream.next());

            recordCounter++;
        }
        avroStream.close();
        inStream.close();

        System.out.println("Appended " + recordCounter + " records from " + thisFileStatus.getPath().getName()
                + " to " + seedFile.getName());
    }

    dfw.close();
    dfwBase.close();
}

From source file:com.awcoleman.StandaloneJava.AvroCounterByBlock.java

License:Apache License

public AvroCounterByBlock(String inDirStr) throws IOException {

    long numAvroRecords = 0;

    //Get list of input files
    ArrayList<FileStatus> inputFileList = new ArrayList<FileStatus>();

    Configuration conf = new Configuration();
    conf.addResource(new Path("/etc/hadoop/conf/core-site.xml"));
    conf.set("dfs.replication", "1"); //see http://stackoverflow.com/questions/24548699/how-to-append-to-an-hdfs-file-on-an-extremely-small-cluster-3-nodes-or-less

    FileSystem hdfs = null;//from w ww .  ja va  2 s.  c om
    try {
        hdfs = FileSystem.get(conf);
    } catch (java.io.IOException ioe) {
        System.out.println("Error opening HDFS filesystem. Exiting. Error message: " + ioe.getMessage());
        System.exit(1);
    }
    if (hdfs.getStatus() == null) {
        System.out.println("Unable to contact HDFS filesystem. Exiting.");
        System.exit(1);
    }

    //Check if input dirs/file exists and get file list (even if list of single file)
    Path inPath = new Path(inDirStr);
    if (hdfs.exists(inPath) && hdfs.isFile(inPath)) { //single file
        inputFileList.add(hdfs.getFileStatus(inPath));
    } else if (hdfs.exists(inPath) && hdfs.isDirectory(inPath)) { //dir
        //Get list of input files
        RemoteIterator<LocatedFileStatus> fileStatusListIterator = hdfs.listFiles(inPath, true);
        while (fileStatusListIterator.hasNext()) {
            LocatedFileStatus fileStatus = fileStatusListIterator.next();

            if (fileStatus.isFile() && !fileStatus.getPath().getName().equals("_SUCCESS")) {
                inputFileList.add((FileStatus) fileStatus);
            }
        }
    } else {
        System.out.println("Input directory ( " + inDirStr + " ) not found or is not directory. Exiting.");
        System.exit(1);
    }

    for (FileStatus thisFileStatus : inputFileList) {

        //_SUCCESS files are 0 bytes
        if (thisFileStatus.getLen() == 0) {
            continue;
        }

        DataFileStream<Object> dfs = null;
        FSDataInputStream inStream = hdfs.open(thisFileStatus.getPath());
        GenericDatumReader<Object> reader = new GenericDatumReader<Object>();
        dfs = new DataFileStream<Object>(inStream, reader);

        long thisFileRecords = 0;
        while (dfs.hasNext()) {

            numAvroRecords = numAvroRecords + dfs.getBlockCount();
            thisFileRecords = thisFileRecords + dfs.getBlockCount();

            //System.out.println("Input file "+thisFileStatus.getPath()+" getBlockCount() is "+dfs.getBlockCount()+"." );

            dfs.nextBlock();
        }

        System.out.println("Input file " + thisFileStatus.getPath() + " has " + thisFileRecords + " records.");

        dfs.close();
        inStream.close();

        //TODO test on dir with non-avro file and see what the exception is, catch that and log to output but don't die.
    }

    System.out.println("Input dir/file ( " + inDirStr + " ) has " + inputFileList.size() + " files and "
            + numAvroRecords + " total records.");

}

From source file:com.awcoleman.StandaloneJava.AvroCounterByRecord.java

License:Apache License

public AvroCounterByRecord(String inDirStr) throws IOException {

    long numAvroRecords = 0;

    //Get list of input files
    ArrayList<FileStatus> inputFileList = new ArrayList<FileStatus>();

    Configuration conf = new Configuration();
    conf.addResource(new Path("/etc/hadoop/conf/core-site.xml"));
    conf.set("dfs.replication", "1"); //see http://stackoverflow.com/questions/24548699/how-to-append-to-an-hdfs-file-on-an-extremely-small-cluster-3-nodes-or-less

    FileSystem hdfs = null;/*from   ww  w . j a  v  a 2s .  c o m*/
    try {
        hdfs = FileSystem.get(conf);
    } catch (java.io.IOException ioe) {
        System.out.println("Error opening HDFS filesystem. Exiting. Error message: " + ioe.getMessage());
        System.exit(1);
    }
    if (hdfs.getStatus() == null) {
        System.out.println("Unable to contact HDFS filesystem. Exiting.");
        System.exit(1);
    }

    //Check if input dirs/file exists and get file list (even if list of single file)
    Path inPath = new Path(inDirStr);
    if (hdfs.exists(inPath) && hdfs.isFile(inPath)) { //single file
        inputFileList.add(hdfs.getFileStatus(inPath));
    } else if (hdfs.exists(inPath) && hdfs.isDirectory(inPath)) { //dir
        //Get list of input files
        RemoteIterator<LocatedFileStatus> fileStatusListIterator = hdfs.listFiles(inPath, true);
        while (fileStatusListIterator.hasNext()) {
            LocatedFileStatus fileStatus = fileStatusListIterator.next();

            if (fileStatus.isFile() && !fileStatus.getPath().getName().equals("_SUCCESS")) {
                inputFileList.add((FileStatus) fileStatus);
            }
        }
    } else {
        System.out.println("Input directory ( " + inDirStr + " ) not found or is not directory. Exiting.");
        System.exit(1);
    }

    for (FileStatus thisFileStatus : inputFileList) {

        //_SUCCESS files are 0 bytes
        if (thisFileStatus.getLen() == 0) {
            continue;
        }

        DataFileStream<Object> avroStream = null;
        FSDataInputStream inStream = hdfs.open(thisFileStatus.getPath());
        GenericDatumReader<Object> reader = new GenericDatumReader<Object>();
        avroStream = new DataFileStream<Object>(inStream, reader);

        long thisFileRecords = 0;

        while (avroStream.hasNext()) {
            numAvroRecords++;
            thisFileRecords++;
            avroStream.next();
        }
        avroStream.close();
        inStream.close();

        System.out.println("Input file " + thisFileStatus.getPath() + " has " + thisFileRecords + " records.");

        //TODO test on dir with non-avro file and see what the exception is, catch that and log to output but don't die.
    }

    System.out.println("Input dir/file ( " + inDirStr + " ) has " + inputFileList.size() + " files and "
            + numAvroRecords + " total records.");

}

From source file:com.bark.hadoop.lab3.PageRank.java

@Override
public int run(String args[]) {
    String tmp = "/tmp/" + new Date().getTime();
    //        long timeStamp = new Date().getTime();
    try {//ww  w  .  j  av  a2s.c o  m
        /**
         * Job 1: Parse XML input and read title,links
         */
        Configuration conf = new Configuration();
        conf.set("xmlinput.start", "<page>");
        conf.set("xmlinput.end", "</page>");

        Job job = Job.getInstance(conf);
        job.setJarByClass(PageRank.class);

        // specify a mapper
        job.setMapperClass(RedLinkMapper.class);

        // specify a reducer
        job.setReducerClass(RedLinkReducer.class);

        // specify output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        // specify input and output DIRECTORIES
        FileInputFormat.addInputPath(job, new Path(args[0]));
        job.setInputFormatClass(XmlInputFormat.class);

        FileOutputFormat.setOutputPath(job, new Path((args[1] + tmp + "/job1")));
        job.setOutputFormatClass(TextOutputFormat.class);

        job.waitForCompletion(true);
    } catch (InterruptedException | ClassNotFoundException | IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error during mapreduce job1.");
        return 2;
    }
    /**
     * Job 2: Adjacency outGraph
     */
    try {
        Configuration conf2 = new Configuration();

        Job job2 = Job.getInstance(conf2);
        job2.setJarByClass(PageRank.class);

        // specify a mapper
        job2.setMapperClass(AdjMapper.class);

        // specify a reducer
        job2.setReducerClass(AdjReducer.class);

        // specify output types
        job2.setOutputKeyClass(Text.class);
        job2.setOutputValueClass(Text.class);

        // specify input and output DIRECTORIES
        FileInputFormat.addInputPath(job2, new Path((args[1] + tmp + "/job1")));
        job2.setInputFormatClass(TextInputFormat.class);

        FileOutputFormat.setOutputPath(job2, new Path((args[1] + tmp + "/job2")));
        job2.setOutputFormatClass(TextOutputFormat.class);

        job2.waitForCompletion(true);
    } catch (InterruptedException | ClassNotFoundException | IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error during mapreduce job2.");
        return 2;
    }
    /**
     * Job 3: PageCount
     */
    try {
        Configuration conf3 = new Configuration();
        /**
         * Change output separator to "=" instead of default \t for this job
         */
        conf3.set("mapreduce.output.textoutputformat.separator", "=");

        Job job3 = Job.getInstance(conf3);
        job3.setJarByClass(PageRank.class);

        // specify a mapper
        job3.setMapperClass(PageCountMapper.class);

        // specify a reducer
        job3.setReducerClass(PageCountReducer.class);

        // specify output types
        job3.setOutputKeyClass(Text.class);
        job3.setOutputValueClass(IntWritable.class);

        // specify input and output DIRECTORIES
        FileInputFormat.addInputPath(job3, new Path((args[1] + tmp + "/job2")));
        job3.setInputFormatClass(TextInputFormat.class);

        FileOutputFormat.setOutputPath(job3, new Path((args[1] + tmp + "/job3")));
        job3.setOutputFormatClass(TextOutputFormat.class);

        job3.waitForCompletion(true);
    } catch (InterruptedException | ClassNotFoundException | IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error during mapreduce job3.");
        return 2;
    }
    /**
     * Job 4: PageRank
     */
    for (int i = 1; i < 9; i++) {
        try {
            Configuration conf4 = new Configuration();
            /**
             * Read number of nodes from the output of job 3 : pageCount
             */
            Path path = new Path((args[1] + tmp + "/job3"));
            FileSystem fs = path.getFileSystem(conf4);
            RemoteIterator<LocatedFileStatus> ri = fs.listFiles(path, true);

            int n = 0;
            Pattern pt = Pattern.compile("(\\d+)");
            while (ri.hasNext()) {
                LocatedFileStatus lfs = ri.next();
                if (lfs.isFile() && n == 0) {
                    FSDataInputStream inputStream = fs.open(lfs.getPath());
                    BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
                    String s = null;
                    while ((s = br.readLine()) != null) {
                        Matcher mt = pt.matcher(s);
                        if (mt.find()) {
                            n = new Integer(mt.group(1));
                            break;
                        }
                    }
                }
            }
            /**
             * Done reading number of nodes, make it available to MapReduce
             * job key: N
             */
            conf4.setInt("N", n);

            Job job4 = Job.getInstance(conf4);
            job4.setJarByClass(PageRank.class);

            // specify a mapper
            job4.setMapperClass(PageRankMapper.class);

            // specify a reducer
            job4.setReducerClass(PageRankReducer.class);

            // specify output types
            job4.setOutputKeyClass(Text.class);
            job4.setOutputValueClass(Text.class);

            // specify input and output DIRECTORIES
            if (i == 1) {
                FileInputFormat.addInputPath(job4, new Path((args[1] + tmp + "/job2")));
            } else {
                FileInputFormat.addInputPath(job4, new Path((args[1] + tmp + "/job4/" + (i - 1))));
            }
            job4.setInputFormatClass(TextInputFormat.class);

            FileOutputFormat.setOutputPath(job4, new Path((args[1] + tmp + "/job4/" + i)));
            job4.setOutputFormatClass(TextOutputFormat.class);
            job4.waitForCompletion(true);
        } catch (InterruptedException | ClassNotFoundException | IOException ex) {
            Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
            System.err.println("Error during mapreduce job4.");
            return 2;
        }
    }
    /**
     * Job 5: Sort iteration 1 and iteration 8
     */
    int returnCode = 0;
    for (int i = 0; i < 2; i++) {
        try {
            Configuration conf5 = new Configuration();

            /**
             * Read number of nodes from the output of job 3 : pageCount
             */
            Path path = new Path((args[1] + tmp + "/job3"));
            FileSystem fs = path.getFileSystem(conf5);
            RemoteIterator<LocatedFileStatus> ri = fs.listFiles(path, true);

            int n = 0;
            Pattern pt = Pattern.compile("(\\d+)");
            while (ri.hasNext()) {
                LocatedFileStatus lfs = ri.next();
                if (lfs.isFile() && n == 0) {
                    FSDataInputStream inputStream = fs.open(lfs.getPath());
                    BufferedReader br = new BufferedReader(new InputStreamReader(inputStream));
                    String s = null;
                    while ((s = br.readLine()) != null) {
                        Matcher mt = pt.matcher(s);
                        if (mt.find()) {
                            n = new Integer(mt.group(1));
                            break;
                        }
                    }
                }
            }
            /**
             * Done reading number of nodes, make it available to MapReduce
             * job key: N
             */
            conf5.setInt("N", n);

            Job job5 = Job.getInstance(conf5);
            /**
             * one reducer only
             */
            job5.setNumReduceTasks(1);
            job5.setSortComparatorClass(MyWritableComparator.class);
            job5.setJarByClass(PageRank.class);

            // specify a mapper
            job5.setMapperClass(SortMapper.class);
            job5.setMapOutputKeyClass(DoubleWritable.class);
            job5.setMapOutputValueClass(Text.class);

            // specify a reducer
            job5.setReducerClass(SortReducer.class);

            // specify output types
            job5.setOutputKeyClass(Text.class);
            job5.setOutputValueClass(DoubleWritable.class);

            // specify input and output DIRECTORIES
            int y = 7 * i + 1;
            FileInputFormat.addInputPath(job5, new Path((args[1] + tmp + "/job4/" + y)));
            job5.setInputFormatClass(TextInputFormat.class);

            FileOutputFormat.setOutputPath(job5, new Path((args[1] + tmp + "/job5/" + y)));
            job5.setOutputFormatClass(TextOutputFormat.class);

            returnCode = job5.waitForCompletion(true) ? 0 : 1;
        } catch (InterruptedException | ClassNotFoundException | IOException ex) {
            Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
            System.err.println("Error during mapreduce job5.");
            return 2;
        }
    }
    /**
     * Copy necessary output files to args[1]        /**
     * Copy necessary output files to args[1]
     */

    /**
     * Rename and copy OutLinkGraph
     */
    try {
        Configuration conf = new Configuration();

        Path outLinkGraph = new Path((args[1] + tmp + "/job2/part-r-00000"));
        FileSystem outLinkGraphFS = outLinkGraph.getFileSystem(conf);

        Path output = new Path(args[1] + "/results/PageRank.outlink.out");
        FileSystem outputFS = output.getFileSystem(conf);
        org.apache.hadoop.fs.FileUtil.copy(outLinkGraphFS, outLinkGraph, outputFS, output, false, true, conf);
    } catch (IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error while copying results.");
        return 2;
    }

    /**
     * Rename and copy total number of pages
     */
    try {
        Configuration conf = new Configuration();

        Path outLinkGraph = new Path((args[1] + tmp + "/job3/part-r-00000"));
        FileSystem outLinkGraphFS = outLinkGraph.getFileSystem(conf);

        Path output = new Path(args[1] + "/results/PageRank.n.out");
        FileSystem outputFS = output.getFileSystem(conf);
        org.apache.hadoop.fs.FileUtil.copy(outLinkGraphFS, outLinkGraph, outputFS, output, false, true, conf);
    } catch (IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error while copying results.");
        return 2;
    }

    /**
     * Rename and copy iteration 1
     */
    try {
        Configuration conf = new Configuration();

        Path outLinkGraph = new Path((args[1] + tmp + "/job5/1/part-r-00000"));
        FileSystem outLinkGraphFS = outLinkGraph.getFileSystem(conf);

        Path output = new Path(args[1] + "/results/PageRank.iter1.out");
        FileSystem outputFS = output.getFileSystem(conf);
        org.apache.hadoop.fs.FileUtil.copy(outLinkGraphFS, outLinkGraph, outputFS, output, false, true, conf);
    } catch (IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error while copying results.");
        return 2;
    }

    /**
     * Rename and copy iteration 8
     */
    try {
        Configuration conf = new Configuration();

        Path outLinkGraph = new Path((args[1] + tmp + "/job5/8/part-r-00000"));
        FileSystem outLinkGraphFS = outLinkGraph.getFileSystem(conf);

        Path output = new Path(args[1] + "/results/PageRank.iter8.out");
        FileSystem outputFS = output.getFileSystem(conf);
        org.apache.hadoop.fs.FileUtil.copy(outLinkGraphFS, outLinkGraph, outputFS, output, false, true, conf);
    } catch (IOException ex) {
        Logger.getLogger(PageRank.class.getName()).log(Level.SEVERE, ex.toString(), ex);
        System.err.println("Error while copying results.");
        return 2;
    }
    return returnCode;
}

From source file:io.druid.indexer.updater.HadoopConverterJob.java

License:Apache License

public List<DataSegment> run() throws IOException {
    final JobConf jobConf = new JobConf();
    jobConf.setKeepFailedTaskFiles(false);
    for (Map.Entry<String, String> entry : converterConfig.getHadoopProperties().entrySet()) {
        jobConf.set(entry.getKey(), entry.getValue(), "converterConfig.getHadoopProperties()");
    }//w  ww  . j  av a 2s.  c  om
    final List<DataSegment> segments = converterConfig.getSegments();
    if (segments.isEmpty()) {
        throw new IAE("No segments found for datasource [%s]", converterConfig.getDataSource());
    }
    converterConfigIntoConfiguration(converterConfig, segments, jobConf);

    jobConf.setNumReduceTasks(0);// Map only. Number of map tasks determined by input format
    jobConf.setWorkingDirectory(new Path(converterConfig.getDistributedSuccessCache()));

    setJobName(jobConf, segments);

    if (converterConfig.getJobPriority() != null) {
        jobConf.setJobPriority(JobPriority.valueOf(converterConfig.getJobPriority()));
    }

    final Job job = Job.getInstance(jobConf);

    job.setInputFormatClass(ConfigInputFormat.class);
    job.setMapperClass(ConvertingMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setMapSpeculativeExecution(false);
    job.setOutputFormatClass(ConvertingOutputFormat.class);

    JobHelper.setupClasspath(JobHelper.distributedClassPath(jobConf.getWorkingDirectory()),
            JobHelper.distributedClassPath(getJobClassPathDir(job.getJobName(), jobConf.getWorkingDirectory())),
            job);

    Throwable throwable = null;
    try {
        job.submit();
        log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());
        final boolean success = job.waitForCompletion(true);
        if (!success) {
            final TaskReport[] reports = job.getTaskReports(TaskType.MAP);
            if (reports != null) {
                for (final TaskReport report : reports) {
                    log.error("Error in task [%s] : %s", report.getTaskId(),
                            Arrays.toString(report.getDiagnostics()));
                }
            }
            return null;
        }
        try {
            loadedBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_LOADED).getValue();
            writtenBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_WRITTEN).getValue();
        } catch (IOException ex) {
            log.error(ex, "Could not fetch counters");
        }
        final JobID jobID = job.getJobID();

        final Path jobDir = getJobPath(jobID, job.getWorkingDirectory());
        final FileSystem fs = jobDir.getFileSystem(job.getConfiguration());
        final RemoteIterator<LocatedFileStatus> it = fs.listFiles(jobDir, true);
        final List<Path> goodPaths = new ArrayList<>();
        while (it.hasNext()) {
            final LocatedFileStatus locatedFileStatus = it.next();
            if (locatedFileStatus.isFile()) {
                final Path myPath = locatedFileStatus.getPath();
                if (ConvertingOutputFormat.DATA_SUCCESS_KEY.equals(myPath.getName())) {
                    goodPaths.add(new Path(myPath.getParent(), ConvertingOutputFormat.DATA_FILE_KEY));
                }
            }
        }
        if (goodPaths.isEmpty()) {
            log.warn("No good data found at [%s]", jobDir);
            return null;
        }
        final List<DataSegment> returnList = ImmutableList
                .copyOf(Lists.transform(goodPaths, new Function<Path, DataSegment>() {
                    @Nullable
                    @Override
                    public DataSegment apply(final Path input) {
                        try {
                            if (!fs.exists(input)) {
                                throw new ISE("Somehow [%s] was found but [%s] is missing at [%s]",
                                        ConvertingOutputFormat.DATA_SUCCESS_KEY,
                                        ConvertingOutputFormat.DATA_FILE_KEY, jobDir);
                            }
                        } catch (final IOException e) {
                            throw Throwables.propagate(e);
                        }
                        try (final InputStream stream = fs.open(input)) {
                            return HadoopDruidConverterConfig.jsonMapper.readValue(stream, DataSegment.class);
                        } catch (final IOException e) {
                            throw Throwables.propagate(e);
                        }
                    }
                }));
        if (returnList.size() == segments.size()) {
            return returnList;
        } else {
            throw new ISE(
                    "Tasks reported success but result length did not match! Expected %d found %d at path [%s]",
                    segments.size(), returnList.size(), jobDir);
        }
    } catch (InterruptedException | ClassNotFoundException e) {
        RuntimeException exception = Throwables.propagate(e);
        throwable = exception;
        throw exception;
    } catch (Throwable t) {
        throwable = t;
        throw t;
    } finally {
        try {
            cleanup(job);
        } catch (IOException e) {
            if (throwable != null) {
                throwable.addSuppressed(e);
            } else {
                log.error(e, "Could not clean up job [%s]", job.getJobID());
            }
        }
    }
}

From source file:io.fluo.webindex.data.LoadHdfs.java

License:Apache License

public static void main(String[] args) throws Exception {

    if (args.length != 1) {
        log.error("Usage: LoadHdfs <dataDir>");
        System.exit(1);/*from  www .j a  v  a  2s  . c  o  m*/
    }
    final String dataDir = args[0];
    IndexEnv.validateDataDir(dataDir);

    final String hadoopConfDir = IndexEnv.getHadoopConfDir();
    final int rateLimit = DataConfig.load().getLoadRateLimit();

    List<String> loadPaths = new ArrayList<>();
    FileSystem hdfs = IndexEnv.getHDFS();
    RemoteIterator<LocatedFileStatus> listIter = hdfs.listFiles(new Path(dataDir), true);
    while (listIter.hasNext()) {
        LocatedFileStatus status = listIter.next();
        if (status.isFile()) {
            loadPaths.add(status.getPath().toString());
        }
    }

    log.info("Loading {} files into Fluo from {}", loadPaths.size(), dataDir);

    SparkConf sparkConf = new SparkConf().setAppName("webindex-load-hdfs");
    try (JavaSparkContext ctx = new JavaSparkContext(sparkConf)) {

        JavaRDD<String> paths = ctx.parallelize(loadPaths, loadPaths.size());

        paths.foreachPartition(iter -> {
            final FluoConfiguration fluoConfig = new FluoConfiguration(new File("fluo.properties"));
            final RateLimiter rateLimiter = rateLimit > 0 ? RateLimiter.create(rateLimit) : null;
            FileSystem fs = IndexEnv.getHDFS(hadoopConfDir);
            try (FluoClient client = FluoFactory.newClient(fluoConfig);
                    LoaderExecutor le = client.newLoaderExecutor()) {
                iter.forEachRemaining(path -> {
                    Path filePath = new Path(path);
                    try {
                        if (fs.exists(filePath)) {
                            FSDataInputStream fsin = fs.open(filePath);
                            ArchiveReader reader = WARCReaderFactory.get(filePath.getName(), fsin, true);
                            for (ArchiveRecord record : reader) {
                                Page page = ArchiveUtil.buildPageIgnoreErrors(record);
                                if (page.getOutboundLinks().size() > 0) {
                                    log.info("Loading page {} with {} links", page.getUrl(),
                                            page.getOutboundLinks().size());
                                    if (rateLimiter != null) {
                                        rateLimiter.acquire();
                                    }
                                    le.execute(PageLoader.updatePage(page));
                                }
                            }
                        }
                    } catch (IOException e) {
                        log.error("Exception while processing {}", path, e);
                    }
                });
            }
        });
    }
}

From source file:io.fluo.webindex.data.spark.IndexEnv.java

License:Apache License

public static void validateDataDir(String dataDir) {
    try {/*from  w ww .ja v a2s .c o m*/
        FileSystem hdfs = getHDFS();
        Path dataPath = new Path(dataDir);
        if (!hdfs.exists(dataPath)) {
            log.error("HDFS data directory {} does not exist", dataDir);
            System.exit(-1);
        }
        RemoteIterator<LocatedFileStatus> listIter = hdfs.listFiles(dataPath, true);
        while (listIter.hasNext()) {
            LocatedFileStatus status = listIter.next();
            if (status.isFile()) {
                return;
            }
        }
        log.error("HDFS data directory {} has no files", dataDir);
        System.exit(-1);
    } catch (IOException e) {
        throw new IllegalStateException(e);
    }
}

From source file:org.apache.druid.indexer.updater.HadoopConverterJob.java

License:Apache License

public List<DataSegment> run() throws IOException {
    final JobConf jobConf = new JobConf();
    jobConf.setKeepFailedTaskFiles(false);
    for (Map.Entry<String, String> entry : converterConfig.getHadoopProperties().entrySet()) {
        jobConf.set(entry.getKey(), entry.getValue(), "converterConfig.getHadoopProperties()");
    }/*  ww w .jav a  2 s  .  c  o  m*/
    final List<DataSegment> segments = converterConfig.getSegments();
    if (segments.isEmpty()) {
        throw new IAE("No segments found for datasource [%s]", converterConfig.getDataSource());
    }
    converterConfigIntoConfiguration(converterConfig, segments, jobConf);

    jobConf.setNumReduceTasks(0); // Map only. Number of map tasks determined by input format
    jobConf.setWorkingDirectory(new Path(converterConfig.getDistributedSuccessCache()));

    setJobName(jobConf, segments);

    if (converterConfig.getJobPriority() != null) {
        jobConf.setJobPriority(JobPriority.valueOf(converterConfig.getJobPriority()));
    }

    final Job job = Job.getInstance(jobConf);

    job.setInputFormatClass(ConfigInputFormat.class);
    job.setMapperClass(ConvertingMapper.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setMapSpeculativeExecution(false);
    job.setOutputFormatClass(ConvertingOutputFormat.class);

    JobHelper.setupClasspath(JobHelper.distributedClassPath(jobConf.getWorkingDirectory()),
            JobHelper.distributedClassPath(getJobClassPathDir(job.getJobName(), jobConf.getWorkingDirectory())),
            job);

    Throwable throwable = null;
    try {
        job.submit();
        log.info("Job %s submitted, status available at %s", job.getJobName(), job.getTrackingURL());
        final boolean success = job.waitForCompletion(true);
        if (!success) {
            final TaskReport[] reports = job.getTaskReports(TaskType.MAP);
            if (reports != null) {
                for (final TaskReport report : reports) {
                    log.error("Error in task [%s] : %s", report.getTaskId(),
                            Arrays.toString(report.getDiagnostics()));
                }
            }
            return null;
        }
        try {
            loadedBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_LOADED).getValue();
            writtenBytes = job.getCounters().findCounter(COUNTER_GROUP, COUNTER_WRITTEN).getValue();
        } catch (IOException ex) {
            log.error(ex, "Could not fetch counters");
        }
        final JobID jobID = job.getJobID();

        final Path jobDir = getJobPath(jobID, job.getWorkingDirectory());
        final FileSystem fs = jobDir.getFileSystem(job.getConfiguration());
        final RemoteIterator<LocatedFileStatus> it = fs.listFiles(jobDir, true);
        final List<Path> goodPaths = new ArrayList<>();
        while (it.hasNext()) {
            final LocatedFileStatus locatedFileStatus = it.next();
            if (locatedFileStatus.isFile()) {
                final Path myPath = locatedFileStatus.getPath();
                if (ConvertingOutputFormat.DATA_SUCCESS_KEY.equals(myPath.getName())) {
                    goodPaths.add(new Path(myPath.getParent(), ConvertingOutputFormat.DATA_FILE_KEY));
                }
            }
        }
        if (goodPaths.isEmpty()) {
            log.warn("No good data found at [%s]", jobDir);
            return null;
        }
        final List<DataSegment> returnList = ImmutableList
                .copyOf(Lists.transform(goodPaths, new Function<Path, DataSegment>() {
                    @Nullable
                    @Override
                    public DataSegment apply(final Path input) {
                        try {
                            if (!fs.exists(input)) {
                                throw new ISE("Somehow [%s] was found but [%s] is missing at [%s]",
                                        ConvertingOutputFormat.DATA_SUCCESS_KEY,
                                        ConvertingOutputFormat.DATA_FILE_KEY, jobDir);
                            }
                        } catch (final IOException e) {
                            throw Throwables.propagate(e);
                        }
                        try (final InputStream stream = fs.open(input)) {
                            return HadoopDruidConverterConfig.jsonMapper.readValue(stream, DataSegment.class);
                        } catch (final IOException e) {
                            throw Throwables.propagate(e);
                        }
                    }
                }));
        if (returnList.size() == segments.size()) {
            return returnList;
        } else {
            throw new ISE(
                    "Tasks reported success but result length did not match! Expected %d found %d at path [%s]",
                    segments.size(), returnList.size(), jobDir);
        }
    } catch (InterruptedException | ClassNotFoundException e) {
        RuntimeException exception = Throwables.propagate(e);
        throwable = exception;
        throw exception;
    } catch (Throwable t) {
        throwable = t;
        throw t;
    } finally {
        try {
            cleanup(job);
        } catch (IOException e) {
            if (throwable != null) {
                throwable.addSuppressed(e);
            } else {
                log.error(e, "Could not clean up job [%s]", job.getJobID());
            }
        }
    }
}