Example usage for org.apache.hadoop.fs FileSystem makeQualified

List of usage examples for org.apache.hadoop.fs FileSystem makeQualified

Introduction

On this page you can find example usage for org.apache.hadoop.fs FileSystem makeQualified.

Prototype

public Path makeQualified(Path path) 

Document

Qualify a path so that it uses this FileSystem and, if relative, is made absolute.
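
A minimal sketch of what qualification does, shown before the real-world examples below. The Configuration, the relative path, and the results shown in comments are illustrative assumptions, not taken from the sources that follow.

// Resolve a relative path against the FileSystem's scheme, authority,
// and working directory.
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);           // e.g. hdfs://namenode:8020 or file:///
Path relative = new Path("data/input.avro");    // hypothetical relative path
Path qualified = fs.makeQualified(relative);    // e.g. hdfs://namenode:8020/user/alice/data/input.avro
System.out.println(qualified);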

Usage

From source file:com.cloudera.cdk.data.filesystem.FileSystemDatasetRepository.java

License:Apache License

/**
 * Get a {@link com.cloudera.cdk.data.PartitionKey} corresponding to a partition's filesystem path
 * represented as a {@link URI}. If the path is not a valid partition,
 * then {@link IllegalArgumentException} is thrown. Note that the partition does not
 * have to exist.
 * @param dataset the filesystem dataset
 * @param partitionPath a directory path where the partition data is stored
 * @return a partition key representing the partition at the given path
 * @since 0.4.0
 */
@SuppressWarnings("deprecation")
public static PartitionKey partitionKeyForPath(Dataset dataset, URI partitionPath) {
    Preconditions.checkState(dataset.getDescriptor().isPartitioned(),
            "Attempt to get a partition on a non-partitioned dataset (name:%s)", dataset.getName());

    Preconditions.checkArgument(dataset instanceof FileSystemDataset, "Dataset is not a FileSystemDataset");
    FileSystemDataset fsDataset = (FileSystemDataset) dataset;

    FileSystem fs = fsDataset.getFileSystem();
    URI partitionUri = fs.makeQualified(new Path(partitionPath)).toUri();
    URI directoryUri = fsDataset.getDirectory().toUri();
    URI relativizedUri = directoryUri.relativize(partitionUri);

    if (relativizedUri.equals(partitionUri)) {
        throw new IllegalArgumentException(
                String.format("Partition URI %s has different " + "root directory to dataset (directory: %s).",
                        partitionUri, directoryUri));
    }

    Iterable<String> parts = Splitter.on('/').split(relativizedUri.getPath());

    PartitionStrategy partitionStrategy = dataset.getDescriptor().getPartitionStrategy();
    List<FieldPartitioner> fieldPartitioners = partitionStrategy.getFieldPartitioners();
    if (Iterables.size(parts) > fieldPartitioners.size()) {
        throw new IllegalArgumentException(
                String.format("Too many partition directories " + "for %s (%s), expecting %s.", partitionUri,
                        Iterables.size(parts), fieldPartitioners.size()));
    }

    List<Object> values = Lists.newArrayList();
    int i = 0;
    for (String part : parts) {
        Iterator<String> split = Splitter.on('=').split(part).iterator();
        String fieldName = split.next();
        FieldPartitioner fp = fieldPartitioners.get(i++);
        if (!fieldName.equals(fp.getName())) {
            throw new IllegalArgumentException(
                    String.format("Unrecognized partition name " + "'%s' in partition %s, expecting '%s'.",
                            fieldName, partitionUri, fp.getName()));
        }
        if (!split.hasNext()) {
            throw new IllegalArgumentException(String
                    .format("Missing partition value for " + "'%s' in partition %s.", fieldName, partitionUri));
        }
        String stringValue = split.next();
        Object value = fp.valueFromString(stringValue);
        values.add(value);
    }
    return com.cloudera.cdk.data.impl.Accessor.getDefault()
            .newPartitionKey(values.toArray(new Object[values.size()]));
}

From source file:com.cloudera.cdk.data.hcatalog.HiveUtils.java

License:Apache License

static DatasetDescriptor descriptorForTable(Configuration conf, Table table) {
    final DatasetDescriptor.Builder builder = new DatasetDescriptor.Builder();

    final String serializationLib = table.getSerializationLib();
    if (SERDE_TO_FORMAT.containsKey(serializationLib)) {
        builder.format(SERDE_TO_FORMAT.get(serializationLib));
    } else {
        // TODO: should this use an "unknown" format? others fail in open()
        throw new UnknownFormatException("Unknown format for serde:" + serializationLib);
    }

    final Path dataLocation = new Path(table.getDataLocation());
    final FileSystem fs = fsForPath(conf, dataLocation);

    builder.location(fs.makeQualified(dataLocation));

    // custom properties
    String namesProperty = table.getProperty(CUSTOM_PROPERTIES_PROPERTY_NAME);
    if (namesProperty != null) {
        for (String property : NAME_SPLITTER.split(namesProperty)) {
            builder.property(property, table.getProperty(property));
        }
    }

    if (table.getProperty(PARTITION_EXPRESSION_PROPERTY_NAME) != null) {
        builder.partitionStrategy(
                Accessor.getDefault().fromExpression(table.getProperty(PARTITION_EXPRESSION_PROPERTY_NAME)));
    }

    String schemaUrlString = table.getProperty(AVRO_SCHEMA_URL_PROPERTY_NAME);
    if (schemaUrlString != null) {
        try {
            builder.schemaUri(new URI(schemaUrlString));
        } catch (IOException e) {
            throw new MetadataProviderException("Could not read schema", e);
        } catch (URISyntaxException e) {
            // this library sets the URI, so it should always be valid
            throw new MetadataProviderException("[BUG] Invalid schema URI", e);
        }
    }

    String schemaLiteral = table.getProperty(AVRO_SCHEMA_LITERAL_PROPERTY_NAME);
    if (schemaLiteral != null) {
        builder.schemaLiteral(schemaLiteral);
    }

    try {
        return builder.build();
    } catch (IllegalStateException ex) {
        throw new MetadataProviderException("Cannot find schema: missing metadata");
    }
}

From source file:com.cloudera.cdk.data.TestDatasetDescriptor.java

License:Apache License

@Test
public void testSchemaFromHdfs() throws IOException {
    FileSystem fs = getDFS();

    // copy a schema to HDFS
    Path schemaPath = fs.makeQualified(new Path("schema.avsc"));
    FSDataOutputStream out = fs.create(schemaPath);
    IOUtils.copyBytes(USER_SCHEMA_URL.toURL().openStream(), out, fs.getConf());
    out.close();

    // build a schema using the HDFS path and check it's the same
    Schema schema = new DatasetDescriptor.Builder().schemaUri(schemaPath.toUri()).build().getSchema();

    Assert.assertEquals(USER_SCHEMA, schema);
}

From source file:com.cloudera.impala.common.FileSystemUtil.java

License:Apache License

/**
 * Return true iff the path is on the given filesystem.
 */
public static Boolean isPathOnFileSystem(Path path, FileSystem fs) {
    try {
        // Call makeQualified() for the side-effect of FileSystem.checkPath() which will
        // throw an exception if path is not on fs.
        fs.makeQualified(path);
        return true;
    } catch (IllegalArgumentException e) {
        // Path is not on fs.
        return false;
    }
}
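
A hedged illustration of the check above; the local FileSystem and the sample paths are assumptions for demonstration, not taken from the Impala source.

// makeQualified() delegates to FileSystem.checkPath(), so a path whose
// scheme does not match the FileSystem is rejected with an
// IllegalArgumentException, which isPathOnFileSystem() turns into false.
FileSystem localFs = FileSystem.getLocal(new Configuration());
boolean onLocal = FileSystemUtil.isPathOnFileSystem(new Path("/tmp/data"), localFs);          // true
boolean onOther = FileSystemUtil.isPathOnFileSystem(new Path("hdfs://nn/tmp/data"), localFs); // false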

From source file:com.cloudera.kitten.lua.AsapLuaContainerLaunchParameters.java

License:Open Source License

private void addOperatorInputs(Map<String, LocalResource> localResources) throws IOException {
    LOG.info("Inputs: " + operator.getInputFiles());
    FileSystem fs = FileSystem.get(conf);
    for (Entry<String, String> e : operator.getInputFiles().entrySet()) {
        if ((!e.getValue().startsWith("hdfs://")) && (!e.getValue().startsWith("$HDFS"))) {
            LOG.info("adding local resource: " + e);
            String inDir = dir;
            LocalResource rsrc = Records.newRecord(LocalResource.class);
            rsrc.setType(LocalResourceType.FILE);
            rsrc.setVisibility(LocalResourceVisibility.APPLICATION);
            LOG.info("Adding input: " + inDir + "/" + e.getValue());
            Path dst = new Path(inDir + "/" + e.getValue());
            dst = fs.makeQualified(dst);
            FileStatus stat = fs.getFileStatus(dst);
            rsrc.setSize(stat.getLen());
            rsrc.setTimestamp(stat.getModificationTime());
            rsrc.setResource(ConverterUtils.getYarnUrlFromPath(dst));
            localResources.put(e.getKey(), rsrc);
        }
    }
    /*for(String in : operator.getArguments().split(" ")){
       LOG.info("Adding input: "+in);
       LocalResource nl = constructScriptResource();
       localResources.put(in, nl);
    }*/
}

From source file:com.cloudera.kitten.lua.AsapLuaContainerLaunchParameters.java

License:Open Source License

private void configureLocalScriptResourceForPath(LocalResource rsrc, Path path) throws IOException {
    //System.out.println("URI: "+path.toUri());
    FileSystem fs = FileSystem.get(conf);

    Path dst = new Path(dir + "/" + path.getName());
    fs.moveFromLocalFile(path, dst);
    dst = fs.makeQualified(dst);

    FileStatus stat = fs.getFileStatus(dst);
    rsrc.setSize(stat.getLen());
    rsrc.setTimestamp(stat.getModificationTime());
    rsrc.setResource(ConverterUtils.getYarnUrlFromPath(dst));
}

From source file:com.cloudera.oryx.lambda.batch.BatchUpdateFunction.java

License:Open Source License

/**
 * @return paths from {@link FileStatus}es into one comma-separated String
 * @see FileInputFormat#addInputPath(org.apache.hadoop.mapreduce.Job, Path)
 */
private static String joinFSPaths(FileSystem fs, FileStatus[] statuses) {
    StringBuilder joined = new StringBuilder();
    for (FileStatus status : statuses) {
        if (joined.length() > 0) {
            joined.append(',');
        }
        Path path = fs.makeQualified(status.getPath());
        joined.append(StringUtils.escapeString(path.toString()));
    }
    return joined.toString();
}
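
A hedged sketch of how such a joined string is typically consumed; the input directory and Job setup are assumptions for illustration, not part of the Oryx source.

// Qualify every file under a directory and hand the comma-separated list
// to FileInputFormat (org.apache.hadoop.mapreduce.lib.input).
Configuration conf = new Configuration();
FileSystem fs = FileSystem.get(conf);
StringBuilder joined = new StringBuilder();
for (FileStatus status : fs.listStatus(new Path("/data/input"))) {   // hypothetical directory
    if (joined.length() > 0) {
        joined.append(',');
    }
    joined.append(StringUtils.escapeString(fs.makeQualified(status.getPath()).toString()));
}
Job job = Job.getInstance(conf);
FileInputFormat.setInputPaths(job, joined.toString());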

From source file:com.cloudera.oryx.ml.MLUpdate.java

License:Open Source License

@Override
public void runUpdate(JavaSparkContext sparkContext, long timestamp, JavaPairRDD<Object, M> newKeyMessageData,
        JavaPairRDD<Object, M> pastKeyMessageData, String modelDirString,
        TopicProducer<String, String> modelUpdateTopic) throws IOException, InterruptedException {

    Objects.requireNonNull(newKeyMessageData);

    JavaRDD<M> newData = newKeyMessageData.values();
    JavaRDD<M> pastData = pastKeyMessageData == null ? null : pastKeyMessageData.values();

    if (newData != null) {
        newData.cache();
        // This forces caching of the RDD. This shouldn't be necessary but we see some freezes
        // when many workers try to materialize the RDDs at once. Hence the workaround.
        newData.foreachPartition(p -> {
        });
    }
    if (pastData != null) {
        pastData.cache();
        pastData.foreachPartition(p -> {
        });
    }

    List<HyperParamValues<?>> hyperParamValues = getHyperParameterValues();
    int valuesPerHyperParam = HyperParams.chooseValuesPerHyperParam(hyperParamValues.size(), candidates);
    List<List<?>> hyperParameterCombos = HyperParams.chooseHyperParameterCombos(hyperParamValues, candidates,
            valuesPerHyperParam);

    Path modelDir = new Path(modelDirString);
    Path tempModelPath = new Path(modelDir, ".temporary");
    Path candidatesPath = new Path(tempModelPath, Long.toString(System.currentTimeMillis()));

    FileSystem fs = FileSystem.get(modelDir.toUri(), sparkContext.hadoopConfiguration());
    fs.mkdirs(candidatesPath);

    Path bestCandidatePath = findBestCandidatePath(sparkContext, newData, pastData, hyperParameterCombos,
            candidatesPath);

    Path finalPath = new Path(modelDir, Long.toString(System.currentTimeMillis()));
    if (bestCandidatePath == null) {
        log.info("Unable to build any model");
    } else {
        // Move best model into place
        fs.rename(bestCandidatePath, finalPath);
    }
    // Then delete everything else
    fs.delete(candidatesPath, true);

    if (modelUpdateTopic == null) {
        log.info("No update topic configured, not publishing models to a topic");
    } else {
        // Push PMML model onto update topic, if it exists
        Path bestModelPath = new Path(finalPath, MODEL_FILE_NAME);
        if (fs.exists(bestModelPath)) {
            FileStatus bestModelPathFS = fs.getFileStatus(bestModelPath);
            PMML bestModel = null;
            boolean modelNeededForUpdates = canPublishAdditionalModelData();
            boolean modelNotTooLarge = bestModelPathFS.getLen() <= maxMessageSize;
            if (modelNeededForUpdates || modelNotTooLarge) {
                // Either the model is required for publishAdditionalModelData, or required because it's going to
                // be serialized to Kafka
                try (InputStream in = fs.open(bestModelPath)) {
                    bestModel = PMMLUtils.read(in);
                }
            }

            if (modelNotTooLarge) {
                modelUpdateTopic.send("MODEL", PMMLUtils.toString(bestModel));
            } else {
                modelUpdateTopic.send("MODEL-REF", fs.makeQualified(bestModelPath).toString());
            }

            if (modelNeededForUpdates) {
                publishAdditionalModelData(sparkContext, bestModel, newData, pastData, finalPath,
                        modelUpdateTopic);
            }
        }
    }

    if (newData != null) {
        newData.unpersist();
    }
    if (pastData != null) {
        pastData.unpersist();
    }
}

From source file:com.cloudera.recordbreaker.analyzer.FSCrawler.java

License:Open Source License

/**
 * <code>getStartNonblockingCrawl</code> traverses a given filesystem.  It returns immediately
 * and does not wait for the crawl to complete.
 * If the crawl is created or is already ongoing, it returns true.
 * If the crawl is not currently going and cannot start, it returns false. 
 */
public synchronized boolean getStartNonblockingCrawl(final URI fsURI) {
    try {
        final int subdirDepth = INFINITE_CRAWL_DEPTH;
        long fsId = analyzer.getCreateFilesystem(fsURI, true);
        if (fsId < 0) {
            return false;
        }
        LOG.info("Grabbing filesystem: " + fsURI);
        final FileSystem fs = FileSystem.get(fsURI, new Configuration());
        final Path startDir = fs.makeQualified(new Path(fsURI.getPath()));

        final long crawlid = analyzer.getCreatePendingCrawl(fsId, true);
        Thread pendingThread = pendingCrawls.get(crawlid);
        if (pendingThread == null) {
            Thread t = new Thread() {
                public void run() {
                    try {
                        synchronized (pendingCrawls) {
                            pendingCrawls.put(crawlid, this);
                        }
                        synchronized (crawlStatusInfo) {
                            crawlStatusInfo.put(crawlid, new CrawlRuntimeStatus("Initializing crawl"));
                        }
                        // Build the file and dir-level todo lists
                        List<Path> todoFileList = new ArrayList<Path>();
                        List<Path> todoDirList = new ArrayList<Path>();
                        recursiveCrawlBuildList(fs, startDir, subdirDepth, crawlid, todoFileList, todoDirList);

                        // Get the files to process
                        TreeSet<String> observedFilenames = new TreeSet<String>();
                        for (Path p : analyzer.getFilesForCrawl(crawlid)) {
                            observedFilenames.add(p.toString());
                        }
                        for (Iterator<Path> it = todoFileList.iterator(); it.hasNext();) {
                            Path p = it.next();
                            if (observedFilenames.contains(p.toString())) {
                                it.remove();
                            }
                        }

                        // Get the dirs to process
                        TreeSet<String> observedDirnames = new TreeSet<String>();
                        for (Path p : analyzer.getDirsForCrawl(crawlid)) {
                            observedDirnames.add(p.toString());
                        }
                        for (Iterator<Path> it = todoDirList.iterator(); it.hasNext();) {
                            Path p = it.next();
                            if (observedDirnames.contains(p.toString())) {
                                it.remove();
                            }
                        }

                        synchronized (crawlStatusInfo) {
                            CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
                            cstatus.setMessage("Processing files");
                            cstatus.setNumToProcess(todoFileList.size());
                            cstatus.setNumDone(0);
                        }

                        int numDone = 0;
                        for (Path p : todoDirList) {
                            try {
                                analyzer.addSingleFile(fs, p, crawlid);
                            } catch (IOException iex) {
                                iex.printStackTrace();
                            }
                        }
                        for (Path p : todoFileList) {
                            synchronized (crawlStatusInfo) {
                                CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
                                cstatus.setMessage("Processing file " + p.toString());
                            }
                            try {
                                analyzer.addSingleFile(fs, p, crawlid);
                            } catch (Exception iex) {
                                iex.printStackTrace();
                            }
                            numDone++;
                            synchronized (crawlStatusInfo) {
                                CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
                                cstatus.setNumDone(numDone);
                                if (cstatus.shouldFinish()) {
                                    break;
                                }
                            }
                        }
                    } catch (IOException iex) {
                        iex.printStackTrace();
                    } finally {
                        try {
                            synchronized (pendingCrawls) {
                                pendingCrawls.remove(crawlid);
                                analyzer.completeCrawl(crawlid);
                            }
                        } catch (SQLiteException sle) {
                        }
                    }
                }
            };
            t.start();
        }
        return true;
    } catch (Exception iex) {
        iex.printStackTrace();
    }
    return false;
}

From source file:com.conductor.hadoop.WritableValueInputFormat.java

License:Apache License

@Override
public List<InputSplit> getSplits(final JobContext context) throws IOException, InterruptedException {
    final Configuration conf = context.getConfiguration();

    // init the reader
    final String filePath = conf.get(INPUT_FILE_LOCATION_CONF);
    checkArgument(!Strings.isNullOrEmpty(filePath), "Missing property: " + INPUT_FILE_LOCATION_CONF);

    final FileSystem fs = getFileSystem(conf);
    final Path path = fs.makeQualified(new Path(filePath));
    final SequenceFile.Reader reader = getReader(conf, path);

    // create the splits by looping through the values of the input file
    int totalInputs = 0;
    int maxInputsPerSplit = conf.getInt(INPUTS_PER_SPLIT_CONF, DEFAULT_INPUTS_PER_SPLIT);
    long pos = 0L;
    long last = 0L;
    long lengthRemaining = fs.getFileStatus(path).getLen();
    final List<InputSplit> splits = Lists.newArrayList();
    final V value = getV(conf);
    for (final NullWritable key = NullWritable.get(); reader.next(key, value); last = reader.getPosition()) {
        if (++totalInputs % maxInputsPerSplit == 0) {
            long splitSize = last - pos;
            splits.add(new FileSplit(path, pos, splitSize, null));
            lengthRemaining -= splitSize;
            pos = last;
        }
    }
    // create the last split if there is data remaining
    if (lengthRemaining != 0) {
        splits.add(new FileSplit(path, pos, lengthRemaining, null));
    }
    return splits;
}