List of usage examples for org.apache.hadoop.fs.FileSystem#makeQualified
public Path makeQualified(Path path)
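Before the source-file examples below, a minimal sketch of the call itself: makeQualified resolves a possibly relative Path against the FileSystem's scheme, authority, and current working directory and returns an absolute, fully qualified Path. The class name, file name, and the filesystem URI in the comment are illustrative assumptions, not output of any particular cluster.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MakeQualifiedSketch {
    public static void main(String[] args) throws Exception {
        FileSystem fs = FileSystem.get(new Configuration());
        // A relative path is resolved against the filesystem's working directory.
        Path qualified = fs.makeQualified(new Path("data/users.avro"));
        // Might print something like hdfs://namenode:8020/user/alice/data/users.avro (illustrative).
        System.out.println(qualified);
    }
}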
From source file:com.cloudera.cdk.data.filesystem.FileSystemDatasetRepository.java
License:Apache License
/**
 * Get a {@link com.cloudera.cdk.data.PartitionKey} corresponding to a partition's filesystem path
 * represented as a {@link URI}. If the path is not a valid partition,
 * then {@link IllegalArgumentException} is thrown. Note that the partition does not
 * have to exist.
 * @param dataset the filesystem dataset
 * @param partitionPath a directory path where the partition data is stored
 * @return a partition key representing the partition at the given path
 * @since 0.4.0
 */
@SuppressWarnings("deprecation")
public static PartitionKey partitionKeyForPath(Dataset dataset, URI partitionPath) {
    Preconditions.checkState(dataset.getDescriptor().isPartitioned(),
        "Attempt to get a partition on a non-partitioned dataset (name:%s)", dataset.getName());
    Preconditions.checkArgument(dataset instanceof FileSystemDataset,
        "Dataset is not a FileSystemDataset");
    FileSystemDataset fsDataset = (FileSystemDataset) dataset;

    FileSystem fs = fsDataset.getFileSystem();
    URI partitionUri = fs.makeQualified(new Path(partitionPath)).toUri();
    URI directoryUri = fsDataset.getDirectory().toUri();
    URI relativizedUri = directoryUri.relativize(partitionUri);

    if (relativizedUri.equals(partitionUri)) {
        throw new IllegalArgumentException(String.format("Partition URI %s has different "
            + "root directory to dataset (directory: %s).", partitionUri, directoryUri));
    }

    Iterable<String> parts = Splitter.on('/').split(relativizedUri.getPath());

    PartitionStrategy partitionStrategy = dataset.getDescriptor().getPartitionStrategy();
    List<FieldPartitioner> fieldPartitioners = partitionStrategy.getFieldPartitioners();
    if (Iterables.size(parts) > fieldPartitioners.size()) {
        throw new IllegalArgumentException(String.format("Too many partition directories "
            + "for %s (%s), expecting %s.", partitionUri, Iterables.size(parts),
            fieldPartitioners.size()));
    }

    List<Object> values = Lists.newArrayList();
    int i = 0;
    for (String part : parts) {
        Iterator<String> split = Splitter.on('=').split(part).iterator();
        String fieldName = split.next();
        FieldPartitioner fp = fieldPartitioners.get(i++);
        if (!fieldName.equals(fp.getName())) {
            throw new IllegalArgumentException(String.format("Unrecognized partition name "
                + "'%s' in partition %s, expecting '%s'.", fieldName, partitionUri, fp.getName()));
        }
        if (!split.hasNext()) {
            throw new IllegalArgumentException(String.format("Missing partition value for "
                + "'%s' in partition %s.", fieldName, partitionUri));
        }
        String stringValue = split.next();
        Object value = fp.valueFromString(stringValue);
        values.add(value);
    }
    return com.cloudera.cdk.data.impl.Accessor.getDefault()
        .newPartitionKey(values.toArray(new Object[values.size()]));
}
From source file:com.cloudera.cdk.data.hcatalog.HiveUtils.java
License:Apache License
static DatasetDescriptor descriptorForTable(Configuration conf, Table table) {
    final DatasetDescriptor.Builder builder = new DatasetDescriptor.Builder();

    final String serializationLib = table.getSerializationLib();
    if (SERDE_TO_FORMAT.containsKey(serializationLib)) {
        builder.format(SERDE_TO_FORMAT.get(serializationLib));
    } else {
        // TODO: should this use an "unknown" format? others fail in open()
        throw new UnknownFormatException("Unknown format for serde:" + serializationLib);
    }

    final Path dataLocation = new Path(table.getDataLocation());
    final FileSystem fs = fsForPath(conf, dataLocation);

    builder.location(fs.makeQualified(dataLocation));

    // custom properties
    String namesProperty = table.getProperty(CUSTOM_PROPERTIES_PROPERTY_NAME);
    if (namesProperty != null) {
        for (String property : NAME_SPLITTER.split(namesProperty)) {
            builder.property(property, table.getProperty(property));
        }
    }

    if (table.getProperty(PARTITION_EXPRESSION_PROPERTY_NAME) != null) {
        builder.partitionStrategy(
            Accessor.getDefault().fromExpression(table.getProperty(PARTITION_EXPRESSION_PROPERTY_NAME)));
    }

    String schemaUrlString = table.getProperty(AVRO_SCHEMA_URL_PROPERTY_NAME);
    if (schemaUrlString != null) {
        try {
            builder.schemaUri(new URI(schemaUrlString));
        } catch (IOException e) {
            throw new MetadataProviderException("Could not read schema", e);
        } catch (URISyntaxException e) {
            // this library sets the URI, so it should always be valid
            throw new MetadataProviderException("[BUG] Invalid schema URI", e);
        }
    }

    String schemaLiteral = table.getProperty(AVRO_SCHEMA_LITERAL_PROPERTY_NAME);
    if (schemaLiteral != null) {
        builder.schemaLiteral(schemaLiteral);
    }

    try {
        return builder.build();
    } catch (IllegalStateException ex) {
        throw new MetadataProviderException("Cannot find schema: missing metadata");
    }
}
From source file:com.cloudera.cdk.data.TestDatasetDescriptor.java
License:Apache License
@Test
public void testSchemaFromHdfs() throws IOException {
    FileSystem fs = getDFS();

    // copy a schema to HDFS
    Path schemaPath = fs.makeQualified(new Path("schema.avsc"));
    FSDataOutputStream out = fs.create(schemaPath);
    IOUtils.copyBytes(USER_SCHEMA_URL.toURL().openStream(), out, fs.getConf());
    out.close();

    // build a schema using the HDFS path and check it's the same
    Schema schema = new DatasetDescriptor.Builder().schemaUri(schemaPath.toUri()).build().getSchema();
    Assert.assertEquals(USER_SCHEMA, schema);
}
From source file:com.cloudera.impala.common.FileSystemUtil.java
License:Apache License
/**
 * Return true iff the path is on the given filesystem.
 */
public static Boolean isPathOnFileSystem(Path path, FileSystem fs) {
    try {
        // Call makeQualified() for the side-effect of FileSystem.checkPath() which will
        // throw an exception if path is not on fs.
        fs.makeQualified(path);
        return true;
    } catch (IllegalArgumentException e) {
        // Path is not on fs.
        return false;
    }
}
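A hypothetical call site for the helper above, sketched under stated assumptions: the URIs, the class name of the sketch, and the rename-versus-copy decision are illustrative and not part of the Impala source.

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import com.cloudera.impala.common.FileSystemUtil;

public class PathMembershipSketch {
    public static void main(String[] args) throws Exception {
        // Illustrative URIs: one HDFS filesystem, one candidate path on S3.
        FileSystem hdfs = FileSystem.get(URI.create("hdfs://namenode:8020"), new Configuration());
        Path candidate = new Path("s3a://bucket/data/part-00000");
        if (FileSystemUtil.isPathOnFileSystem(candidate, hdfs)) {
            System.out.println("Same filesystem: a rename is enough.");
        } else {
            System.out.println("Different filesystem: fall back to a copy.");
        }
    }
}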
From source file:com.cloudera.kitten.lua.AsapLuaContainerLaunchParameters.java
License:Open Source License
private void addOperatorInputs(Map<String, LocalResource> localResources) throws IOException {
    LOG.info("Inputs: " + operator.getInputFiles());
    FileSystem fs = FileSystem.get(conf);
    for (Entry<String, String> e : operator.getInputFiles().entrySet()) {
        if ((!e.getValue().startsWith("hdfs://")) && (!e.getValue().startsWith("$HDFS"))) {
            LOG.info("adding local resource: " + e);
            String inDir = dir;
            LocalResource rsrc = Records.newRecord(LocalResource.class);
            rsrc.setType(LocalResourceType.FILE);
            rsrc.setVisibility(LocalResourceVisibility.APPLICATION);
            LOG.info("Adding input: " + inDir + "/" + e.getValue());
            Path dst = new Path(inDir + "/" + e.getValue());
            dst = fs.makeQualified(dst);
            FileStatus stat = fs.getFileStatus(dst);
            rsrc.setSize(stat.getLen());
            rsrc.setTimestamp(stat.getModificationTime());
            rsrc.setResource(ConverterUtils.getYarnUrlFromPath(dst));
            localResources.put(e.getKey(), rsrc);
        }
    }
    /*for(String in : operator.getArguments().split(" ")){
        LOG.info("Adding input: "+in);
        LocalResource nl = constructScriptResource();
        localResources.put(in, nl);
    }*/
}
From source file:com.cloudera.kitten.lua.AsapLuaContainerLaunchParameters.java
License:Open Source License
private void configureLocalScriptResourceForPath(LocalResource rsrc, Path path) throws IOException {
    //System.out.println("URI: "+path.toUri());
    FileSystem fs = FileSystem.get(conf);
    Path dst = new Path(dir + "/" + path.getName());
    fs.moveFromLocalFile(path, dst);
    dst = fs.makeQualified(dst);
    FileStatus stat = fs.getFileStatus(dst);
    rsrc.setSize(stat.getLen());
    rsrc.setTimestamp(stat.getModificationTime());
    rsrc.setResource(ConverterUtils.getYarnUrlFromPath(dst));
}
From source file:com.cloudera.oryx.lambda.batch.BatchUpdateFunction.java
License:Open Source License
/**
 * @return paths from {@link FileStatus}es into one comma-separated String
 * @see FileInputFormat#addInputPath(org.apache.hadoop.mapreduce.Job, Path)
 */
private static String joinFSPaths(FileSystem fs, FileStatus[] statuses) {
    StringBuilder joined = new StringBuilder();
    for (FileStatus status : statuses) {
        if (joined.length() > 0) {
            joined.append(',');
        }
        Path path = fs.makeQualified(status.getPath());
        joined.append(StringUtils.escapeString(path.toString()));
    }
    return joined.toString();
}
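One hypothetical way to consume the joined string, sketched as a caller inside the same class (joinFSPaths is private). The directory name, the helper method name, and the use of the standard MapReduce input-directory property are assumptions for illustration, not Oryx code.

// Hypothetical caller: qualify and escape the part files of a directory, then hand
// the comma-separated result to a Hadoop Configuration as the input-directory list.
private static void configureInputDirs(Configuration conf) throws IOException {
    FileSystem fs = FileSystem.get(conf);
    FileStatus[] statuses = fs.listStatus(new Path("/data/input")); // illustrative directory
    // "mapreduce.input.fileinputformat.inputdir" is the standard MapReduce input property.
    conf.set("mapreduce.input.fileinputformat.inputdir", joinFSPaths(fs, statuses));
}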
From source file:com.cloudera.oryx.ml.MLUpdate.java
License:Open Source License
@Override
public void runUpdate(JavaSparkContext sparkContext, long timestamp,
        JavaPairRDD<Object, M> newKeyMessageData, JavaPairRDD<Object, M> pastKeyMessageData,
        String modelDirString, TopicProducer<String, String> modelUpdateTopic)
        throws IOException, InterruptedException {

    Objects.requireNonNull(newKeyMessageData);

    JavaRDD<M> newData = newKeyMessageData.values();
    JavaRDD<M> pastData = pastKeyMessageData == null ? null : pastKeyMessageData.values();

    if (newData != null) {
        newData.cache();
        // This forces caching of the RDD. This shouldn't be necessary but we see some freezes
        // when many workers try to materialize the RDDs at once. Hence the workaround.
        newData.foreachPartition(p -> {
        });
    }
    if (pastData != null) {
        pastData.cache();
        pastData.foreachPartition(p -> {
        });
    }

    List<HyperParamValues<?>> hyperParamValues = getHyperParameterValues();
    int valuesPerHyperParam = HyperParams.chooseValuesPerHyperParam(hyperParamValues.size(), candidates);
    List<List<?>> hyperParameterCombos = HyperParams.chooseHyperParameterCombos(hyperParamValues, candidates,
        valuesPerHyperParam);

    Path modelDir = new Path(modelDirString);
    Path tempModelPath = new Path(modelDir, ".temporary");
    Path candidatesPath = new Path(tempModelPath, Long.toString(System.currentTimeMillis()));

    FileSystem fs = FileSystem.get(modelDir.toUri(), sparkContext.hadoopConfiguration());
    fs.mkdirs(candidatesPath);

    Path bestCandidatePath = findBestCandidatePath(sparkContext, newData, pastData, hyperParameterCombos,
        candidatesPath);

    Path finalPath = new Path(modelDir, Long.toString(System.currentTimeMillis()));
    if (bestCandidatePath == null) {
        log.info("Unable to build any model");
    } else {
        // Move best model into place
        fs.rename(bestCandidatePath, finalPath);
    }
    // Then delete everything else
    fs.delete(candidatesPath, true);

    if (modelUpdateTopic == null) {
        log.info("No update topic configured, not publishing models to a topic");
    } else {
        // Push PMML model onto update topic, if it exists
        Path bestModelPath = new Path(finalPath, MODEL_FILE_NAME);
        if (fs.exists(bestModelPath)) {
            FileStatus bestModelPathFS = fs.getFileStatus(bestModelPath);
            PMML bestModel = null;
            boolean modelNeededForUpdates = canPublishAdditionalModelData();
            boolean modelNotTooLarge = bestModelPathFS.getLen() <= maxMessageSize;
            if (modelNeededForUpdates || modelNotTooLarge) {
                // Either the model is required for publishAdditionalModelData, or required because
                // it's going to be serialized to Kafka
                try (InputStream in = fs.open(bestModelPath)) {
                    bestModel = PMMLUtils.read(in);
                }
            }

            if (modelNotTooLarge) {
                modelUpdateTopic.send("MODEL", PMMLUtils.toString(bestModel));
            } else {
                modelUpdateTopic.send("MODEL-REF", fs.makeQualified(bestModelPath).toString());
            }

            if (modelNeededForUpdates) {
                publishAdditionalModelData(sparkContext, bestModel, newData, pastData, finalPath,
                    modelUpdateTopic);
            }
        }
    }

    if (newData != null) {
        newData.unpersist();
    }
    if (pastData != null) {
        pastData.unpersist();
    }
}
From source file:com.cloudera.recordbreaker.analyzer.FSCrawler.java
License:Open Source License
/**
 * <code>getStartNonblockingCrawl</code> traverses a given filesystem. It returns immediately
 * and does not wait for the crawl to complete.
 * If the crawl is created or is already ongoing, it returns true.
 * If the crawl is not currently going and cannot start, it returns false.
 */
public synchronized boolean getStartNonblockingCrawl(final URI fsURI) {
    try {
        final int subdirDepth = INFINITE_CRAWL_DEPTH;
        long fsId = analyzer.getCreateFilesystem(fsURI, true);
        if (fsId < 0) {
            return false;
        }
        LOG.info("Grabbing filesystem: " + fsURI);
        final FileSystem fs = FileSystem.get(fsURI, new Configuration());
        final Path startDir = fs.makeQualified(new Path(fsURI.getPath()));

        final long crawlid = analyzer.getCreatePendingCrawl(fsId, true);
        Thread pendingThread = pendingCrawls.get(crawlid);
        if (pendingThread == null) {
            Thread t = new Thread() {
                public void run() {
                    try {
                        synchronized (pendingCrawls) {
                            pendingCrawls.put(crawlid, this);
                        }
                        synchronized (crawlStatusInfo) {
                            crawlStatusInfo.put(crawlid, new CrawlRuntimeStatus("Initializing crawl"));
                        }

                        // Build the file and dir-level todo lists
                        List<Path> todoFileList = new ArrayList<Path>();
                        List<Path> todoDirList = new ArrayList<Path>();
                        recursiveCrawlBuildList(fs, startDir, subdirDepth, crawlid, todoFileList, todoDirList);

                        // Get the files to process
                        TreeSet<String> observedFilenames = new TreeSet<String>();
                        for (Path p : analyzer.getFilesForCrawl(crawlid)) {
                            observedFilenames.add(p.toString());
                        }
                        for (Iterator<Path> it = todoFileList.iterator(); it.hasNext();) {
                            Path p = it.next();
                            if (observedFilenames.contains(p.toString())) {
                                it.remove();
                            }
                        }

                        // Get the dirs to process
                        TreeSet<String> observedDirnames = new TreeSet<String>();
                        for (Path p : analyzer.getDirsForCrawl(crawlid)) {
                            observedDirnames.add(p.toString());
                        }
                        for (Iterator<Path> it = todoDirList.iterator(); it.hasNext();) {
                            Path p = it.next();
                            if (observedDirnames.contains(p.toString())) {
                                it.remove();
                            }
                        }

                        synchronized (crawlStatusInfo) {
                            CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
                            cstatus.setMessage("Processing files");
                            cstatus.setNumToProcess(todoFileList.size());
                            cstatus.setNumDone(0);
                        }

                        int numDone = 0;
                        for (Path p : todoDirList) {
                            try {
                                analyzer.addSingleFile(fs, p, crawlid);
                            } catch (IOException iex) {
                                iex.printStackTrace();
                            }
                        }
                        for (Path p : todoFileList) {
                            synchronized (crawlStatusInfo) {
                                CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
                                cstatus.setMessage("Processing file " + p.toString());
                            }
                            try {
                                analyzer.addSingleFile(fs, p, crawlid);
                            } catch (Exception iex) {
                                iex.printStackTrace();
                            }
                            numDone++;
                            synchronized (crawlStatusInfo) {
                                CrawlRuntimeStatus cstatus = crawlStatusInfo.get(crawlid);
                                cstatus.setNumDone(numDone);
                                if (cstatus.shouldFinish()) {
                                    break;
                                }
                            }
                        }
                    } catch (IOException iex) {
                        iex.printStackTrace();
                    } finally {
                        try {
                            synchronized (pendingCrawls) {
                                pendingCrawls.remove(crawlid);
                                analyzer.completeCrawl(crawlid);
                            }
                        } catch (SQLiteException sle) {
                        }
                    }
                }
            };
            t.start();
        }
        return true;
    } catch (Exception iex) {
        iex.printStackTrace();
    }
    return false;
}
From source file:com.conductor.hadoop.WritableValueInputFormat.java
License:Apache License
@Override
public List<InputSplit> getSplits(final JobContext context) throws IOException, InterruptedException {
    final Configuration conf = context.getConfiguration();

    // init the reader
    final String filePath = conf.get(INPUT_FILE_LOCATION_CONF);
    checkArgument(!Strings.isNullOrEmpty(filePath), "Missing property: " + INPUT_FILE_LOCATION_CONF);
    final FileSystem fs = getFileSystem(conf);
    final Path path = fs.makeQualified(new Path(filePath));
    final SequenceFile.Reader reader = getReader(conf, path);

    // create the splits by looping through the values of the input file
    int totalInputs = 0;
    int maxInputsPerSplit = conf.getInt(INPUTS_PER_SPLIT_CONF, DEFAULT_INPUTS_PER_SPLIT);
    long pos = 0L;
    long last = 0L;
    long lengthRemaining = fs.getFileStatus(path).getLen();
    final List<InputSplit> splits = Lists.newArrayList();
    final V value = getV(conf);
    for (final NullWritable key = NullWritable.get(); reader.next(key, value); last = reader.getPosition()) {
        if (++totalInputs % maxInputsPerSplit == 0) {
            long splitSize = last - pos;
            splits.add(new FileSplit(path, pos, splitSize, null));
            lengthRemaining -= splitSize;
            pos = last;
        }
    }
    // create the last split if there is data remaining
    if (lengthRemaining != 0) {
        splits.add(new FileSplit(path, pos, lengthRemaining, null));
    }
    return splits;
}