List of usage examples for org.apache.hadoop.fs Path toUri
public URI toUri()
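Path.toUri() converts a Hadoop Path into a java.net.URI, exposing its scheme, authority, and path components; the examples below typically pass that URI to FileSystem.get(uri, conf) or print it in log messages. A minimal sketch of the call, using a hypothetical path string that is not taken from any of the examples below:

    import java.net.URI;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class PathToUriExample {
        public static void main(String[] args) throws Exception {
            // Hypothetical fully qualified path; relative or unqualified Paths work too.
            Path p = new Path("hdfs://namenode:8020/user/alice/data/part-00000");
            URI uri = p.toUri();                     // java.net.URI view of the Path
            System.out.println(uri.getScheme());     // hdfs
            System.out.println(uri.getAuthority());  // namenode:8020
            System.out.println(uri.getPath());       // /user/alice/data/part-00000

            // Common pattern in the examples below: resolve the FileSystem that owns this Path.
            Configuration conf = new Configuration();
            FileSystem fs = FileSystem.get(uri, conf);
            System.out.println(fs.getUri());
        }
    }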
From source file:com.twitter.algebra.TransposeJob.java
License:Apache License
/**
 * Perform transpose of A, where A is already wrapped in a DistributedRowMatrix
 * object.
 *
 * @param distM
 *          input matrix A
 * @param conf
 *          the initial configuration
 * @param label
 *          the label for the output directory
 * @return At wrapped in a DistributedRowMatrix object
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public static DistributedRowMatrix transpose(DistributedRowMatrix distM, Configuration conf, String label)
        throws IOException, InterruptedException, ClassNotFoundException {
    Path outputPath = new Path(distM.getOutputTempPath(), label);
    FileSystem fs = FileSystem.get(outputPath.toUri(), conf);
    TransposeJob job = new TransposeJob();
    if (!fs.exists(outputPath)) {
        job.run(conf, distM.getRowPath(), outputPath, distM.numRows(), distM.numCols());
    } else {
        log.warn("----------- Skip already exists: " + outputPath);
    }
    DistributedRowMatrix m = new DistributedRowMatrix(outputPath, distM.getOutputTempPath(), distM.numCols(),
            distM.numRows());
    m.setConf(conf);
    return m;
}
From source file:com.twitter.algebra.TransposeJob.java
License:Apache License
/**
 * Perform transpose of A, where A refers to the path that contains a matrix
 * in {@link SequenceFileInputFormat}.
 *
 * @param conf
 *          the initial configuration
 * @param matrixInputPath
 *          the path to the input files that we process
 * @param matrixOutputPath
 *          the path of the resulting transpose matrix
 * @param numInputRows
 *          rows
 * @param numInputCols
 *          cols
 * @throws IOException
 * @throws InterruptedException
 * @throws ClassNotFoundException
 */
public void run(Configuration conf, Path matrixInputPath, Path matrixOutputPath, int numInputRows,
        int numInputCols) throws IOException, InterruptedException, ClassNotFoundException {
    conf.setInt(NUM_ORIG_ROWS_KEY, numInputRows);
    conf.setInt(RowPartitioner.TOTAL_KEYS, numInputCols);
    FileSystem fs = FileSystem.get(matrixInputPath.toUri(), conf);
    NMFCommon.setNumberOfMapSlots(conf, fs, matrixInputPath, "transpose");

    @SuppressWarnings("deprecation")
    Job job = new Job(conf);
    job.setJarByClass(TransposeJob.class);
    job.setJobName(TransposeJob.class.getSimpleName());

    matrixInputPath = fs.makeQualified(matrixInputPath);
    matrixOutputPath = fs.makeQualified(matrixOutputPath);

    FileInputFormat.addInputPath(job, matrixInputPath);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    FileOutputFormat.setOutputPath(job, matrixOutputPath);
    job.setMapperClass(TransposeMapper.class);
    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(VectorWritable.class);

    int numReducers = NMFCommon.getNumberOfReduceSlots(conf, "transpose");
    job.setNumReduceTasks(numReducers);
    // job.setPartitionerClass(RowPartitioner.IntRowPartitioner.class);
    RowPartitioner.setPartitioner(job, RowPartitioner.IntRowPartitioner.class, numInputCols);

    job.setCombinerClass(MergeVectorsCombiner.class);
    job.setReducerClass(MergeVectorsReducer.class);
    job.setOutputFormatClass(MatrixOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(VectorWritable.class);
    job.submit();
    boolean res = job.waitForCompletion(true);
    if (!res)
        throw new IOException("Job failed!");
}
From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License:Apache License
/**
 * @param context
 * @param file
 *          an input file to work on provided to the job
 * @return true if there is an index file for the input file
 * @throws IOException
 */
public static boolean foundIndexFile(JobContext context, Path file) throws IOException {
    Configuration conf = context.getConfiguration();
    FileSystem fs = file.getFileSystem(conf);
    Path indexFilePath = new Path(getIndexDir(context) + file.toUri().getRawPath() + "/"
            + BlockIndexedFileInputFormat.INDEXMETAFILENAME);
    if (!fs.exists(indexFilePath)) {
        LOG.info("no index file found for input file:" + file + " at location " + indexFilePath);
        return false;
    }
    FSDataInputStream in = fs.open(indexFilePath);
    ThriftWritable<FileIndexDescriptor> writable = ThriftWritable.newInstance(FileIndexDescriptor.class);
    writable.readFields(in);
    FileIndexDescriptor indexDescriptor = writable.get();
    in.close();
    return verifyInputFileCheckSum(indexDescriptor, context);
}
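The BlockIndexedFileInputFormat examples above and below build index locations by appending file.toUri().getRawPath() to an index root directory. A minimal sketch of what that call yields; the input path, index root, and meta-file name here are hypothetical stand-ins, not the project's actual values:

    import java.net.URI;
    import org.apache.hadoop.fs.Path;

    public class RawPathSketch {
        public static void main(String[] args) {
            // Hypothetical input file; the index root stands in for getIndexDir(context).
            Path file = new Path("hdfs://namenode:8020/logs/2013/01/events.lzo");
            URI uri = file.toUri();
            String indexDir = "/user/elephanttwin/index";
            // getRawPath() keeps only the slash-separated path portion of the URI,
            // so the input file's directory layout is mirrored under the index root.
            System.out.println(uri.getRawPath()); // /logs/2013/01/events.lzo
            // "index.meta" stands in for BlockIndexedFileInputFormat.INDEXMETAFILENAME.
            System.out.println(new Path(indexDir + uri.getRawPath() + "/" + "index.meta"));
        }
    }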
From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License:Apache License
private List<LongPairWritable> getFilterQualifiedBlocks(JobContext context, Path file,
        BinaryExpression filterCondition, long splitMaxSize) throws IOException {
    Expression lhs = filterCondition.getLhs();
    Expression rhs = filterCondition.getRhs();
    if (filterCondition.getOpType() == OpType.OP_EQ) { // "leaf node"
        // handle cases like 'abcd' == column , column == 'abcd'
        if (rhs instanceof Column && lhs instanceof Const) {
            lhs = filterCondition.getRhs();
            rhs = filterCondition.getLhs();
        }
        String columnName = ((Column) lhs).getName();
        String value = ((String) ((Const) rhs).getValue());
        Text searchedValue = new Text(value);
        FileStatus[] dirlist = listIndexFiles(context, file, columnName);
        int part_num = dirlist.length;
        int part_seqnum = (new HashPartitioner<Text, Text>()).getPartition(searchedValue, searchedValue,
                part_num);
        String part_name = "/part-r-" + String.format("%05d", part_seqnum);
        FileSystem fs = file.getFileSystem(context.getConfiguration());
        MapFile.Reader mapFileIndexReader = new MapFile.Reader(fs,
                getIndexDir(context) + file.toUri().getRawPath() + "/" + columnName + part_name,
                context.getConfiguration());
        ListLongPair indexedBlocks = new ListLongPair();
        mapFileIndexReader.get(searchedValue, indexedBlocks);
        mapFileIndexReader.close();
        return indexedBlocks.get();
    }
    List<LongPairWritable> blocksLeft = getFilterQualifiedBlocks(context, file, (BinaryExpression) lhs,
            splitMaxSize);
    List<LongPairWritable> blocksRight = getFilterQualifiedBlocks(context, file, (BinaryExpression) rhs,
            splitMaxSize);
    if (filterCondition.getOpType() == OpType.OP_AND)
        return andFilter(blocksLeft, blocksRight);
    else if (filterCondition.getOpType() == OpType.OP_OR) {
        return orFilter(blocksLeft, blocksRight, splitMaxSize);
    } else
        throw new IOException("not supported filter condition:" + filterCondition);
}
From source file:com.twitter.elephanttwin.retrieval.BlockIndexedFileInputFormat.java
License:Apache License
/**
 * @param context
 * @param file
 *          the input file provided to the job to work on
 * @param columnName
 * @return the list of index files if there is an index directory created for
 *         the input file
 * @throws IOException
 */
protected static FileStatus[] listIndexFiles(JobContext context, Path file, String columnName)
        throws IOException {
    Path indexFilePath = new Path(getIndexDir(context) + file.toUri().getRawPath() + "/" + columnName);
    FileSystem fs = file.getFileSystem(context.getConfiguration());
    FileStatus[] dirlist = fs.listStatus(indexFilePath, indexFileFilter);
    return dirlist;
}
From source file:com.twitter.elephanttwin.util.HdfsUtils.java
License:Apache License
/**
 * Read UTF-8 lines of text directly from HDFS.
 */
public static Collection<String> readLines(FileSystem fs, Path path) throws IOException {
    Preconditions.checkNotNull(fs);
    Preconditions.checkNotNull(path);
    LOG.info("Reading from " + path.toUri());
    List<String> lines = Lists.newArrayList();
    try {
        if (!fs.exists(path)) {
            throw new IOException("File not found at " + path);
        }
        // TODO(Jimmy Lin): return CharStreams.readLines(new InputStreamReader(fs.open(path), "UTF-8"))
        // Note that this basically dups the functionality of HdfsFileTransfer.
        BufferedReader stream = new BufferedReader(new InputStreamReader(fs.open(path), "UTF-8"));
        String line;
        while ((line = stream.readLine()) != null) {
            lines.add(line);
        }
        LOG.info("Read " + lines.size() + " queries from " + path.toUri());
        return lines;
    } catch (IOException e) {
        LOG.warning("Failed to read " + path.toUri() + ": " + e.toString());
        throw e;
    }
}
From source file:com.twitter.hraven.etl.FileLister.java
License:Apache License
/**
 * Looks at the src path and fetches the list of files to process.
 * Confirms that the size of each file is less than maxFileSize;
 * an hbase cell can't store files bigger than maxFileSize,
 * hence no need to consider them for raw loading.
 * Reference: {@link https://github.com/twitter/hraven/issues/59}
 * @param maxFileSize - max #bytes to be stored in an hbase cell
 * @param recurse - whether to recurse or not
 * @param hdfs - filesystem to be looked at
 * @param inputPath - root dir of the path containing history files
 * @param pathFilter - the JobFileModifiedRangePathFilter used to filter out files
 * @return - array of FileStatus of files to be processed
 * @throws IOException
 */
public static FileStatus[] getListFilesToProcess(long maxFileSize, boolean recurse, FileSystem hdfs,
        Path inputPath, JobFileModifiedRangePathFilter pathFilter) throws IOException {
    LOG.info(" in getListFilesToProcess maxFileSize=" + maxFileSize + " inputPath= " + inputPath.toUri());
    FileStatus[] origList = listFiles(recurse, hdfs, inputPath, pathFilter);
    if (origList == null) {
        LOG.info(" No files found, orig list returning 0");
        return new FileStatus[0];
    }
    return pruneFileListBySize(maxFileSize, origList, hdfs, inputPath);
}
From source file:com.twitter.hraven.etl.FileLister.java
License:Apache License
/**
 * prunes the given list/array of files based on their sizes
 *
 * @param maxFileSize - max #bytes to be stored in an hbase cell
 * @param origList - input list of files to be processed
 * @param hdfs - filesystem to be looked at
 * @param inputPath - root dir of the path containing history files
 * @return - pruned array of FileStatus of files to be processed
 */
static FileStatus[] pruneFileListBySize(long maxFileSize, FileStatus[] origList, FileSystem hdfs,
        Path inputPath) {
    LOG.info("Pruning orig list of size " + origList.length + " for source" + inputPath.toUri());
    long fileSize = 0L;
    List<FileStatus> prunedFileList = new ArrayList<FileStatus>();
    Set<String> toBeRemovedJobId = new HashSet<String>();
    for (int i = 0; i < origList.length; i++) {
        fileSize = origList[i].getLen();
        // check if hbase can store this file; if yes, consider it for processing
        if (fileSize <= maxFileSize) {
            prunedFileList.add(origList[i]);
        } else {
            Path hugeFile = origList[i].getPath();
            LOG.info("In getListFilesToProcess filesize " + fileSize + " has exceeded maxFileSize "
                    + maxFileSize + " for " + hugeFile.toUri());
            // note the job id so that we can remove the other file (job conf or job history)
            toBeRemovedJobId.add(getJobIdFromPath(hugeFile));
        }
    }
    if (prunedFileList.size() == 0) {
        LOG.info("Found no files worth processing. Returning 0 sized array");
        return new FileStatus[0];
    }
    String jobId = null;
    ListIterator<FileStatus> it = prunedFileList.listIterator();
    while (it.hasNext()) {
        if (toBeRemovedJobId.size() == 0) {
            // no files to remove
            break;
        }
        Path curFile = it.next().getPath();
        jobId = getJobIdFromPath(curFile);
        if (toBeRemovedJobId.contains(jobId)) {
            LOG.info("Removing from prunedList " + curFile.toUri());
            it.remove();
            /*
             * removing the job id from the hash set since there would be only
             * one file with this job id in the prunedList; the other file with
             * this job id was huge and was already moved out
             */
            toBeRemovedJobId.remove(jobId);
        }
    }
    return prunedFileList.toArray(new FileStatus[prunedFileList.size()]);
}
From source file:com.twitter.hraven.etl.FileLister.java
License:Apache License
/**
 * extracts the job id from a Path
 * @param aPath input Path
 * @return job id as string
 */
static String getJobIdFromPath(Path aPath) {
    String fileName = aPath.getName();
    JobFile jf = new JobFile(fileName);
    String jobId = jf.getJobid();
    if (jobId == null) {
        throw new ProcessingException("job id is null for " + aPath.toUri());
    }
    return jobId;
}
From source file:com.twitter.hraven.etl.JobFilePartitioner.java
License:Apache License
@Override
public int run(String[] args) throws Exception {
    myConf = getConf();

    // Presume these are all HDFS paths, even when accessed as file://
    hdfs = FileSystem.get(myConf);

    // Grab input args and allow for -Dxyz style arguments
    String[] otherArgs = new GenericOptionsParser(myConf, args).getRemainingArgs();

    // Grab the arguments we're looking for.
    CommandLine commandLine = parseArgs(otherArgs);

    // Grab the input path argument
    input = commandLine.getOptionValue("i");
    LOG.info("input=" + input);

    // Grab the output path argument
    String output = commandLine.getOptionValue("o");
    LOG.info("output=" + output);

    skipExisting = commandLine.hasOption("s");
    LOG.info("skipExisting=" + skipExisting);

    moveFiles = commandLine.hasOption("m");
    LOG.info("moveFiles=" + moveFiles);

    if (skipExisting && moveFiles) {
        throw new IllegalArgumentException("Cannot use both options skipExisting and move simultaneously.");
    }

    if (commandLine.hasOption("x")) {
        try {
            maXretention = Integer.parseInt(commandLine.getOptionValue("x"));
        } catch (NumberFormatException nfe) {
            throw new IllegalArgumentException(
                    "maXretention option -x is not a valid number: " + commandLine.getOptionValue("x"), nfe);
        }
        // Additional check
        if (maXretention < 0) {
            throw new IllegalArgumentException(
                    "Cannot retain less than 0 files. Specified maXretention option -x is: "
                            + commandLine.getOptionValue("x"));
        }
        LOG.info("maXretention=" + maXretention);

        if (moveFiles) {
            throw new IllegalArgumentException("Cannot use both options maXretention and move simultaneously.");
        }
    } else {
        maXretention = Integer.MAX_VALUE;
    }

    outputPath = new Path(output);
    FileStatus outputFileStatus = hdfs.getFileStatus(outputPath);
    if (!outputFileStatus.isDir()) {
        throw new IOException("Output is not a directory" + outputFileStatus.getPath().getName());
    }

    Path inputPath = new Path(input);
    URI inputURI = inputPath.toUri();
    String inputScheme = inputURI.getScheme();
    LOG.info("input scheme is: " + inputScheme);

    // If the input directory is HDFS, then process as such. Assume no scheme means HDFS.
    if ((inputScheme == null) || (hdfs.getUri().getScheme().equals(inputScheme))) {
        processHDFSSources(inputPath);
    } else if (inputScheme.equals("file")) {
        if (moveFiles) {
            throw new IllegalArgumentException(
                    "Cannot move files that are not already in hdfs. Input is not HDFS: " + input);
        }
        processPlainFileSources(inputURI);
    } else {
        throw new IllegalArgumentException("Cannot process files from this URI scheme: " + inputScheme);
    }

    Statistics statistics = FileSystem.getStatistics(outputPath.toUri().getScheme(), hdfs.getClass());
    if (statistics != null) {
        LOG.info("HDFS bytes read: " + statistics.getBytesRead());
        LOG.info("HDFS bytes written: " + statistics.getBytesWritten());
        LOG.info("HDFS read ops: " + statistics.getReadOps());
        System.out.println("HDFS large read ops: " + statistics.getLargeReadOps());
        LOG.info("HDFS write ops: " + statistics.getWriteOps());
    }
    return 0;
}
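The run() method above dispatches on the scheme of inputPath.toUri() to decide between HDFS and local-file processing. A minimal sketch of that scheme check in isolation; the paths and the hard-coded "hdfs" comparison are illustrative only (the original compares against the default filesystem's scheme):

    import org.apache.hadoop.fs.Path;

    public class SchemeCheckSketch {
        public static void main(String[] args) {
            // Hypothetical inputs; a Path without a scheme resolves against the default filesystem.
            for (String s : new String[] { "hdfs://namenode:8020/history", "file:///tmp/history", "relative/dir" }) {
                Path p = new Path(s);
                String scheme = p.toUri().getScheme(); // "hdfs", "file", or null
                if (scheme == null || "hdfs".equals(scheme)) {
                    System.out.println(p + " -> treat as HDFS source");
                } else if ("file".equals(scheme)) {
                    System.out.println(p + " -> treat as local filesystem source");
                } else {
                    System.out.println(p + " -> unsupported scheme: " + scheme);
                }
            }
        }
    }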