Example usage for org.apache.hadoop.fs Path toUri

List of usage examples for org.apache.hadoop.fs Path toUri

Introduction

On this page you can find example usage for org.apache.hadoop.fs Path toUri.

Prototype

public URI toUri() 

Document

Convert this Path to a URI.
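
A minimal, self-contained sketch of the call (the HDFS location below is hypothetical and used only for illustration); it shows that toUri() exposes the scheme, authority, and path components that the Path wraps:

import java.net.URI;

import org.apache.hadoop.fs.Path;

public class PathToUriExample {
    public static void main(String[] args) {
        // Hypothetical HDFS location, used only to illustrate the URI components.
        Path path = new Path("hdfs://namenode:8020/data/input/part-00000");
        URI uri = path.toUri();

        System.out.println(uri.getScheme());    // hdfs
        System.out.println(uri.getAuthority()); // namenode:8020
        System.out.println(uri.getPath());      // /data/input/part-00000
    }
}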

Usage

From source file:com.linkedin.cubert.pig.piggybank.storage.avro.AvroStorage.java

License:Apache License

/**
 * Get the Avro schema of the first input file that matches the location pattern.
 *
 * @param paths  set of input files
 * @param conf  configuration
 * @return avro schema
 * @throws IOException
 */
protected Schema getAvroSchema(Set<Path> paths, Configuration conf) throws IOException {
    if (paths == null || paths.isEmpty()) {
        return null;
    }
    Iterator<Path> iterator = paths.iterator();
    Schema schema = null;
    while (iterator.hasNext()) {
        Path path = iterator.next();
        FileSystem fs = FileSystem.get(path.toUri(), conf);
        schema = getAvroSchema(path, fs);
        if (schema != null) {
            break;
        }
    }
    return schema;
}
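
The snippet above passes path.toUri() to FileSystem.get(URI, Configuration), which resolves the filesystem from the URI's scheme and authority rather than from the default fs.defaultFS. A minimal sketch of the same pattern, using a hypothetical path, is shown below:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class ResolveFileSystemExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        // Hypothetical input path; the URI scheme decides which FileSystem implementation is used.
        Path path = new Path("hdfs://namenode:8020/user/data/input.avro");

        // FileSystem.get(URI, Configuration) picks the implementation for the URI's
        // scheme and authority, so the same code works for hdfs://, file://, s3a://, etc.
        FileSystem fs = FileSystem.get(path.toUri(), conf);
        System.out.println(fs.getUri());
    }
}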

From source file:com.linkedin.cubert.pig.piggybank.storage.avro.AvroStorage.java

License:Apache License

/**
 * Merge multiple input Avro schemas into one. Note that arbitrary schemas cannot be merged;
 * see AvroStorageUtils.mergeSchema() for what is and is not allowed.
 *
 * @param basePaths  set of input dir or files
 * @param conf  configuration
 * @return avro schema
 * @throws IOException
 */
protected Schema getMergedSchema(Set<Path> basePaths, Configuration conf) throws IOException {
    Schema result = null;
    Map<Path, Schema> mergedFiles = new HashMap<Path, Schema>();

    Set<Path> paths = AvroStorageUtils.getAllFilesRecursively(basePaths, conf);
    for (Path path : paths) {
        FileSystem fs = FileSystem.get(path.toUri(), conf);
        Schema schema = getSchema(path, fs);
        if (schema != null) {
            result = AvroStorageUtils.mergeSchema(result, schema);
            mergedFiles.put(path, schema);
        }
    }
    // schemaToMergedSchemaMap is only needed when merging multiple records.
    if ((schemaToMergedSchemaMap == null || schemaToMergedSchemaMap.isEmpty()) && mergedFiles.size() > 1
            && result.getType().equals(Schema.Type.RECORD)) {
        schemaToMergedSchemaMap = AvroStorageUtils.getSchemaToMergedSchemaMap(result, mergedFiles);
    }
    return result;
}

From source file:com.linkedin.cubert.pig.piggybank.storage.avro.AvroStorage.java

License:Apache License

/**
 * Initialize the output Avro schema using the input property map
 */
protected void init(Map<String, Object> inputs) throws IOException {

    /*used to store field schemas */
    List<Field> fields = null;

    /* set debug level */
    if (inputs.containsKey("debug")) {
        AvroStorageLog.setDebugLevel((Integer) inputs.get("debug"));
    }

    /* initialize schema manager, if any */
    AvroSchemaManager schemaManager = null;
    if (inputs.containsKey("data")) {
        Path path = new Path((String) inputs.get("data"));
        AvroStorageLog.details("data path=" + path.toUri().toString());
        FileSystem fs = FileSystem.get(path.toUri(), new Configuration());
        Schema schema = getAvroSchema(path, fs);
        schemaManager = new AvroSchemaManager(schema);
    } else if (inputs.containsKey("schema_file")) {
        Path path = new Path((String) inputs.get("schema_file"));
        AvroStorageLog.details("schemaFile path=" + path.toUri().toString());
        FileSystem fs = FileSystem.get(path.toUri(), new Configuration());
        Schema schema = getSchemaFromFile(path, fs);
        schemaManager = new AvroSchemaManager(schema);
    }

    /* iterate input property map */
    for (Entry<String, Object> entry : inputs.entrySet()) {
        String name = entry.getKey().trim();
        Object value = entry.getValue();

        if (name.equalsIgnoreCase("index")) {
            /* set index of store function */
            storeFuncIndex = (Integer) value;
        } else if (name.equalsIgnoreCase("same")) {
            /* use schema in the specified path as output schema */
            Path path = new Path(((String) value).trim());
            AvroStorageLog.details("data path=" + path.toUri().toString());
            FileSystem fs = FileSystem.get(path.toUri(), new Configuration());
            outputAvroSchema = getAvroSchema(path, fs);
        } else if (name.equalsIgnoreCase("nullable")) {
            nullable = (Boolean) value;
        } else if (name.equalsIgnoreCase("schema")) {
            outputAvroSchema = Schema.parse((String) value);
            userSpecifiedAvroSchema = outputAvroSchema;
        } else if (name.equalsIgnoreCase("schema_uri")) {
            /* use the contents of the specified path as output schema */
            Path path = new Path(((String) value).trim());
            AvroStorageLog.details("schema_uri path=" + path.toUri().toString());
            FileSystem fs = FileSystem.get(path.toUri(), new Configuration());
            outputAvroSchema = getSchemaFromFile(path, fs);
            userSpecifiedAvroSchema = outputAvroSchema;
        } else if (name.matches("field\\d+")) {
            /* set the schema of the field at the given index */
            if (fields == null)
                fields = new ArrayList<Field>();

            int index = Integer.parseInt(name.substring("field".length()));
            String content = ((String) value).trim();
            Field field = null;
            if (content.equalsIgnoreCase(NOTNULL)) {
                /* a null schema means the Avro schema is derived from the Pig schema, but the field is not made nullable */
                field = AvroStorageUtils.createUDField(index, null);
            } else if (content.startsWith("def:")) {
                if (schemaManager == null)
                    throw new IOException("Please specify data parameter (using \"data\") before this one.");

                String alias = content.substring("def:".length());
                Schema s = schemaManager.getSchema(alias);
                if (s == null)
                    throw new IOException("Cannot find matching schema for alias:" + alias);
                /* use pre-defined schema*/
                field = AvroStorageUtils.createUDField(index, s);

                AvroStorageLog.details("Use pre-defined schema(" + alias + "): " + s + " for field " + index);
            } else {
                Schema schema = null;
                try {
                    schema = Schema.parse(content);
                } catch (RuntimeException e) {
                    /* might be a primitive schema such as int or long */
                    schema = Schema.parse("\"" + content + "\"");
                }

                field = AvroStorageUtils.createUDField(index, schema);
            }

            fields.add(field);
        } else if (!name.equalsIgnoreCase("data") && !name.equalsIgnoreCase("schema_file")
                && !name.equalsIgnoreCase("debug")) {
            throw new IOException("Invalid parameter:" + name);
        }
    }

    /* if schemas of some fields are set */
    if (fields != null && outputAvroSchema == null) {
        outputAvroSchema = AvroStorageUtils.createUDPartialRecordSchema();
        outputAvroSchema.setFields(fields);
    }

    /* if an output schema is specified, nullable cannot be false;
     * print a warning and force nullable to true. */
    if (outputAvroSchema != null) {
        if (!nullable) {
            AvroStorageLog.warn("Invalid parameter--nullable cannot be false while "
                    + "output schema is not null. Will ignore nullable.\n\n");
            nullable = true;
        }
    }

}

From source file:com.linkedin.cubert.pig.piggybank.storage.avro.AvroStorageUtils.java

License:Apache License

/**
 * Returns all non-hidden files recursively inside the given base paths
 *
 * @throws IOException
 */
public static Set<Path> getAllFilesRecursively(Set<Path> basePaths, Configuration conf) throws IOException {
    Set<Path> paths = new HashSet<Path>();
    for (Path path : basePaths) {
        FileSystem fs = FileSystem.get(path.toUri(), conf);
        FileStatus f = fs.getFileStatus(path);
        if (f.isDir()) {
            getAllFilesInternal(f, conf, paths, fs);
        } else {
            paths.add(path);
        }
    }
    return paths;
}

From source file:com.linkedin.pinot.hadoop.job.SegmentCreationJob.java

License:Apache License

private void addDepsJarToDistributedCache(Path path, Job job) throws IOException {
    LOGGER.info("Trying to add all the deps jar files from directory: {}", path);
    FileSystem fs = FileSystem.get(getConf());
    FileStatus[] fileStatusArr = fs.listStatus(path);
    for (FileStatus fileStatus : fileStatusArr) {
        if (fileStatus.isDirectory()) {
            addDepsJarToDistributedCache(fileStatus.getPath(), job);
        } else {
            Path depJarPath = fileStatus.getPath();
            if (depJarPath.getName().endsWith(".jar")) {
                LOGGER.info("Adding deps jar files: {}", path);
                job.addCacheArchive(path.toUri());
            }
        }
    }
}

From source file:com.linkedin.pinot.hadoop.job.SegmentUriPushJob.java

License:Apache License

public void pushOneTarFile(FileSystem fs, Path path) throws Exception {
    String fileName = path.getName();
    if (!fileName.endsWith(".tar.gz")) {
        return;
    }
    for (String host : _hosts) {
        String uri = String.format("%s%s%s", _pushUriPrefix, path.toUri().getRawPath(), _pushUriSuffix);
        LOGGER.info("******** Upoading file: {} to Host: {} and Port: {} with download uri: {} *******",
                fileName, host, _port, uri);
        try {
            int responseCode = FileUploadUtils.sendSegmentUri(host, _port, uri);
            LOGGER.info("Response code: {}", responseCode);
        } catch (Exception e) {
            LOGGER.error("******** Error Upoading file: {} to Host: {} and Port: {}  *******", fileName, host,
                    _port);
            LOGGER.error("Caught exception during upload", e);
            throw new RuntimeException("Got Error during send tar files to push hosts!");
        }
    }
}

From source file:com.m6d.filecrush.crush.Crush.java

License:Apache License

void writeDirs() throws IOException {

    print(Verbosity.INFO, "\nUsing temporary directory " + tmpDir.toUri().getPath() + "\n");

    FileStatus status = fs.getFileStatus(srcDir);

    Path tmpIn = new Path(tmpDir, "in");

    bucketFiles = new Path(tmpIn, "dirs");
    partitionMap = new Path(tmpIn, "partition-map");
    counters = new Path(tmpIn, "counters");

    skippedFiles = new HashSet<String>();
    removableFiles = new HashSet<String>();

    /*
     * Prefer the path returned by the status because it is always fully qualified.
     */
    List<Path> dirs = asList(status.getPath());

    Text key = new Text();
    Text value = new Text();

    Bucketer partitionBucketer = new Bucketer(maxTasks, 0, false);
    partitionBucketer.reset("partition-map");

    jobCounters = new Counters();
    int fileCount = 0;

    //Path bucketFile = new Path(tmpIn, "dirs_" + fileCount++);
    Writer writer = SequenceFile.createWriter(fs, job, bucketFiles, Text.class, Text.class,
            CompressionType.BLOCK);

    try {
        while (!dirs.isEmpty()) {
            List<Path> nextLevel = new LinkedList<Path>();

            for (Path dir : dirs) {
                String dirPath = dir.toUri().getPath();
                print(Verbosity.INFO, "\n\n[" + dirPath + "]");

                jobCounters.incrCounter(MapperCounter.DIRS_FOUND, 1);

                FileStatus[] contents = fs.listStatus(dir, new PathFilter() {
                    @Override
                    public boolean accept(Path testPath) {
                        if (ignoredFilesMatcher == null)
                            return true;
                        ignoredFilesMatcher.reset(testPath.toUri().getPath());
                        boolean ignores = ignoredFilesMatcher.matches();
                        if (ignores)
                            LOG.info("Ignoring file " + testPath);
                        return !ignores;
                    }

                });

                if (contents == null || contents.length == 0) {
                    print(Verbosity.INFO, "\n  Directory is empty");

                    jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                } else {
                    List<FileStatus> crushables = new ArrayList<FileStatus>(contents.length);
                    Set<String> uncrushedFiles = new HashSet<String>(contents.length);

                    long crushableBytes = 0;

                    /*
                     * Queue sub directories for subsequent inspection and examine the files in this directory.
                     */
                    for (FileStatus content : contents) {
                        Path path = content.getPath();

                        if (content.isDir()) {
                            nextLevel.add(path);
                        } else {
                            String filePath = path.toUri().getPath();
                            boolean skipFile = false;
                            if (skippedFilesMatcher != null) {
                                skippedFilesMatcher.reset(filePath);
                                if (skippedFilesMatcher.matches()) {
                                    skipFile = true;
                                }
                            }

                            boolean changed = uncrushedFiles.add(filePath);
                            assert changed : path.toUri().getPath();
                            long fileLength = content.getLen();

                            if (!skipFile && fileLength <= maxEligibleSize) {
                                if (removeEmptyFiles && fileLength == 0)
                                    removableFiles.add(filePath);
                                else {
                                    crushables.add(content);
                                    crushableBytes += fileLength;
                                }
                            }
                        }
                    }

                    /*
                     * We found a directory with data in it. Make sure we know how to name the crush output file and then increment the
                     * number of files we found.
                     */
                    if (!uncrushedFiles.isEmpty()) {
                        if (-1 == findMatcher(dir)) {
                            throw new IllegalArgumentException(
                                    "Could not find matching regex for directory: " + dir);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_FOUND, uncrushedFiles.size());
                    }

                    if (0 == crushableBytes) {
                        print(Verbosity.INFO, "\n  Directory has no crushable files");

                        jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                    } else {
                        /*
                         * We found files to consider for crushing.
                         */
                        long nBlocks = crushableBytes / dfsBlockSize;

                        if (nBlocks * dfsBlockSize != crushableBytes) {
                            nBlocks++;
                        }

                        /*
                         * maxFileBlocks will be huge in v1 mode, which will lead to one bucket per directory.
                         */
                        long dirBuckets = nBlocks / maxFileBlocks;
                        if (dirBuckets * maxFileBlocks != nBlocks) {
                            dirBuckets++;
                        }

                        if (dirBuckets > Integer.MAX_VALUE) {
                            throw new AssertionError("Too many buckets: " + dirBuckets);
                        }

                        Bucketer directoryBucketer = new Bucketer((int) dirBuckets, excludeSingleFileDirs);
                        directoryBucketer.reset(getPathPart(dir));

                        for (FileStatus file : crushables) {
                            directoryBucketer.add(new FileStatusHasSize(file));
                        }

                        List<Bucket> crushFiles = directoryBucketer.createBuckets();
                        if (crushFiles.isEmpty()) {
                            jobCounters.incrCounter(MapperCounter.DIRS_SKIPPED, 1);
                            print(Verbosity.INFO, "\n  Directory skipped");
                        } else {
                            nBuckets += crushFiles.size();
                            jobCounters.incrCounter(MapperCounter.DIRS_ELIGIBLE, 1);
                            print(Verbosity.INFO, "\n  Generating " + crushFiles.size() + " output files");

                            /*
                             * Write out the mapping between a bucket and a file.
                             */
                            for (Bucket crushFile : crushFiles) {
                                String bucketId = crushFile.name();

                                List<String> filesInBucket = crushFile.contents();

                                print(Verbosity.INFO,
                                        format("\n  Output %s will include %,d input bytes from %,d files",
                                                bucketId, crushFile.size(), filesInBucket.size()));

                                key.set(bucketId);

                                for (String f : filesInBucket) {
                                    boolean changed = uncrushedFiles.remove(f);
                                    assert changed : f;

                                    pathMatcher.reset(f);
                                    pathMatcher.matches();

                                    value.set(pathMatcher.group(5));

                                    /*
                                     * Write one row per file to maximize the number of mappers
                                     */
                                    writer.append(key, value);

                                    /*
                                     * Print the input file with four leading spaces.
                                     */
                                    print(Verbosity.VERBOSE, "\n    " + f);
                                }

                                jobCounters.incrCounter(MapperCounter.FILES_ELIGIBLE, filesInBucket.size());

                                partitionBucketer.add(crushFile);
                            }
                        }
                    }

                    if (!removableFiles.isEmpty()) {
                        print(Verbosity.INFO, "\n  Marked " + removableFiles.size() + " files for removal");

                        for (String removable : removableFiles) {
                            uncrushedFiles.remove(removable);
                            print(Verbosity.VERBOSE, "\n    " + removable);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_REMOVED, removableFiles.size());
                    }

                    if (!uncrushedFiles.isEmpty()) {
                        print(Verbosity.INFO, "\n  Skipped " + uncrushedFiles.size() + " files");

                        for (String uncrushed : uncrushedFiles) {
                            print(Verbosity.VERBOSE, "\n    " + uncrushed);
                        }

                        jobCounters.incrCounter(MapperCounter.FILES_SKIPPED, uncrushedFiles.size());
                    }

                    skippedFiles.addAll(uncrushedFiles);
                }
            }

            dirs = nextLevel;
        }
    } finally {
        writer.close();
    }

    /*
     * Now that we have processed all the directories, write the partition map.
     */
    List<Bucket> partitions = partitionBucketer.createBuckets();
    assert partitions.size() <= maxTasks;

    writer = SequenceFile.createWriter(fs, job, partitionMap, Text.class, IntWritable.class);
    IntWritable partNum = new IntWritable();
    int totalReducers = 0;
    for (Bucket partition : partitions) {
        String partitionName = partition.name();

        int p = Integer.parseInt(partitionName.substring(partitionName.lastIndexOf('-') + 1));
        partNum.set(p);

        if (partition.contents().size() > 0)
            totalReducers++;

        for (String bucketId : partition.contents()) {
            key.set(bucketId);
            writer.append(key, partNum);
        }
    }
    writer.close();

    print(Verbosity.INFO, "\n\nNumber of allocated reducers = " + totalReducers);
    job.setInt("mapreduce.job.reduces", totalReducers);

    DataOutputStream countersStream = fs.create(this.counters);
    jobCounters.write(countersStream);
    countersStream.close();
}

From source file:com.m6d.filecrush.crush.integration.CrushMapReduceTest.java

License:Apache License

@Before
@Override
public void setUp() throws Exception {
    super.setUp();

    job = createJobConf();

    job.setBoolean("mapreduce.output.fileoutputformat.compress", true);
    job.set("mapreduce.output.fileoutputformat.compress.type", CompressionType.BLOCK.name());
    job.set("mapreduce.output.fileoutputformat.compress.codec", CustomCompressionCodec.class.getName());

    FileSystem fs = getFileSystem();

    Path homeDirPath = fs.makeQualified(new Path("."));

    homeDir = homeDirPath.toUri().getPath();

    fs.delete(homeDirPath, true);

    defaultCodec = new DefaultCodec();
    defaultCodec.setConf(job);

    customCodec = new CustomCompressionCodec();
    customCodec.setConf(job);
}

From source file:com.marklogic.contentpump.ArchiveInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = super.getSplits(job);
    Iterator<InputSplit> iter = splits.iterator();
    while (iter.hasNext()) {
        InputSplit s = iter.next();
        Path file = ((FileSplit) s).getPath();
        String zipfile = file.toUri().getPath();
        if (LOG.isDebugEnabled()) {
            LOG.debug("Zip file name: " + zipfile);
        }
        int index = file.toUri().getPath().lastIndexOf(EXTENSION);
        if (index == -1) {
            throw new IOException("Archive file should have suffix .zip");
        }
        String subStr = file.toUri().getPath().substring(0, index);
        index = subStr.lastIndexOf('-');
        if (index == -1) {
            throw new IOException("Not type information in Archive name");
        }
        String typeStr = subStr.substring(index + 1, subStr.length());
        try {
            ContentType.valueOf(typeStr);
        } catch (IllegalArgumentException ex) {
            LOG.error("Not a valid archive: " + zipfile);
            iter.remove();
        }
    }
    return splits;
}

From source file:com.marklogic.contentpump.DocumentPathFilter.java

License:Apache License

@Override
public void setConf(Configuration conf) {
    this.conf = conf;
    pattern = conf.get(ConfigConstants.CONF_INPUT_FILE_PATTERN, ".*");
    String inPath = conf.get(ConfigConstants.CONF_INPUT_DIRECTORY);
    if (LOG.isDebugEnabled()) {
        LOG.debug(ConfigConstants.CONF_INPUT_DIRECTORY + ": " + inPath);
    }
    Path path = new Path(inPath);
    try {
        fs = FileSystem.get(path.toUri(), conf);
    } catch (IOException e) {
        LOG.error("Please check path: " + inPath, e);
    }
}