List of usage examples for org.apache.hadoop.fs.Path.toUri()
public URI toUri()
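Before the examples, a minimal self-contained sketch of what toUri() gives you and the FileSystem.get(uri, conf) idiom that recurs in the examples below. The path string, host, and port here are illustrative assumptions, not taken from any of the source files.

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class PathToUriExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical path; a local, HDFS, or S3A path string works the same way
        Path path = new Path("hdfs://namenode:8020/data/input/part-00000");

        // toUri() exposes the Path's scheme/authority/path components as a java.net.URI
        URI uri = path.toUri();
        System.out.println("scheme    = " + uri.getScheme());    // hdfs
        System.out.println("authority = " + uri.getAuthority()); // namenode:8020
        System.out.println("path      = " + uri.getPath());      // /data/input/part-00000

        // The common idiom below: resolve the FileSystem that matches the path's URI
        // rather than relying on the default filesystem from the configuration
        FileSystem fs = FileSystem.get(uri, new Configuration());
        System.out.println(fs.getClass().getName());
    }
}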
From source file: com.cloudera.oryx.app.pmml.AppPMMLUtils.java
License: Open Source License
public static PMML readPMMLFromUpdateKeyMessage(String key, String message, Configuration hadoopConf)
        throws IOException {
    String pmmlString;
    switch (key) {
    case "MODEL":
        pmmlString = message;
        break;
    case "MODEL-REF":
        // Allowing null is mostly for integration tests
        if (hadoopConf == null) {
            hadoopConf = new Configuration();
        }
        Path messagePath = new Path(message);
        FileSystem fs = FileSystem.get(messagePath.toUri(), hadoopConf);
        try (InputStreamReader in = new InputStreamReader(fs.open(messagePath), StandardCharsets.UTF_8)) {
            pmmlString = CharStreams.toString(in);
        } catch (FileNotFoundException fnfe) {
            log.warn("Unable to load model file at {}; ignoring", messagePath);
            return null;
        }
        break;
    default:
        throw new IllegalArgumentException("Unknown key " + key);
    }
    return PMMLUtils.fromString(pmmlString);
}
From source file: com.cloudera.oryx.lambda.batch.BatchUpdateFunction.java
License: Open Source License
@Override
public void call(JavaPairRDD<K, M> newData, Time timestamp) throws IOException, InterruptedException {
    if (newData.isEmpty()) {
        log.info("No data in current generation's RDD; nothing to do");
        return;
    }
    log.info("Beginning update at {}", timestamp);
    Configuration hadoopConf = sparkContext.hadoopConfiguration();
    if (hadoopConf.getResource("core-site.xml") == null) {
        log.warn("Hadoop config like core-site.xml was not found; "
                + "is the Hadoop config directory on the classpath?");
    }
    JavaPairRDD<K, M> pastData;
    Path inputPathPattern = new Path(dataDirString + "/*/part-*");
    FileSystem fs = FileSystem.get(inputPathPattern.toUri(), hadoopConf);
    FileStatus[] inputPathStatuses = fs.globStatus(inputPathPattern);
    if (inputPathStatuses == null || inputPathStatuses.length == 0) {
        log.info("No past data at path(s) {}", inputPathPattern);
        pastData = null;
    } else {
        log.info("Found past data at path(s) like {}", inputPathStatuses[0].getPath());
        Configuration updatedConf = new Configuration(hadoopConf);
        updatedConf.set(FileInputFormat.INPUT_DIR, joinFSPaths(fs, inputPathStatuses));
        @SuppressWarnings("unchecked")
        JavaPairRDD<Writable, Writable> pastWritableData = (JavaPairRDD<Writable, Writable>) sparkContext
                .newAPIHadoopRDD(updatedConf, SequenceFileInputFormat.class, keyWritableClass,
                        messageWritableClass);
        pastData = pastWritableData.mapToPair(
                new WritableToValueFunction<>(keyClass, messageClass, keyWritableClass, messageWritableClass));
    }
    if (updateTopic == null || updateBroker == null) {
        log.info("Not producing updates to update topic since none was configured");
        updateInstance.runUpdate(sparkContext, timestamp.milliseconds(), newData, pastData, modelDirString,
                null);
    } else {
        // This TopicProducer should not be async; sends one big model generally and
        // needs to occur before other updates reliably rather than be buffered
        try (TopicProducer<String, U> producer = new TopicProducerImpl<>(updateBroker, updateTopic, false)) {
            updateInstance.runUpdate(sparkContext, timestamp.milliseconds(), newData, pastData, modelDirString,
                    producer);
        }
    }
}
From source file: com.cloudera.oryx.lambda.DeleteOldDataFn.java
License: Open Source License
@Override
public void call(T ignored) throws IOException {
    Path dataDirPath = new Path(dataDirString + "/*");
    FileSystem fs = FileSystem.get(dataDirPath.toUri(), hadoopConf);
    FileStatus[] inputPathStatuses = fs.globStatus(dataDirPath);
    if (inputPathStatuses != null) {
        long oldestTimeAllowed = System.currentTimeMillis()
                - TimeUnit.MILLISECONDS.convert(maxAgeHours, TimeUnit.HOURS);
        Arrays.stream(inputPathStatuses).filter(FileStatus::isDirectory).map(FileStatus::getPath)
                .filter(subdir -> {
                    Matcher m = dirTimestampPattern.matcher(subdir.getName());
                    return m.find() && Long.parseLong(m.group(1)) < oldestTimeAllowed;
                }).forEach(subdir -> {
                    log.info("Deleting old data at {}", subdir);
                    try {
                        fs.delete(subdir, true);
                    } catch (IOException e) {
                        log.warn("Unable to delete {}; continuing", subdir, e);
                    }
                });
    }
}
From source file: com.cloudera.oryx.ml.MLUpdate.java
License: Open Source License
@Override
public void runUpdate(JavaSparkContext sparkContext, long timestamp, JavaPairRDD<Object, M> newKeyMessageData,
        JavaPairRDD<Object, M> pastKeyMessageData, String modelDirString,
        TopicProducer<String, String> modelUpdateTopic) throws IOException, InterruptedException {
    Objects.requireNonNull(newKeyMessageData);
    JavaRDD<M> newData = newKeyMessageData.values();
    JavaRDD<M> pastData = pastKeyMessageData == null ? null : pastKeyMessageData.values();
    if (newData != null) {
        newData.cache();
        // This forces caching of the RDD. This shouldn't be necessary but we see some freezes
        // when many workers try to materialize the RDDs at once. Hence the workaround.
        newData.foreachPartition(p -> {
        });
    }
    if (pastData != null) {
        pastData.cache();
        pastData.foreachPartition(p -> {
        });
    }
    List<HyperParamValues<?>> hyperParamValues = getHyperParameterValues();
    int valuesPerHyperParam = HyperParams.chooseValuesPerHyperParam(hyperParamValues.size(), candidates);
    List<List<?>> hyperParameterCombos = HyperParams.chooseHyperParameterCombos(hyperParamValues, candidates,
            valuesPerHyperParam);
    Path modelDir = new Path(modelDirString);
    Path tempModelPath = new Path(modelDir, ".temporary");
    Path candidatesPath = new Path(tempModelPath, Long.toString(System.currentTimeMillis()));
    FileSystem fs = FileSystem.get(modelDir.toUri(), sparkContext.hadoopConfiguration());
    fs.mkdirs(candidatesPath);
    Path bestCandidatePath = findBestCandidatePath(sparkContext, newData, pastData, hyperParameterCombos,
            candidatesPath);
    Path finalPath = new Path(modelDir, Long.toString(System.currentTimeMillis()));
    if (bestCandidatePath == null) {
        log.info("Unable to build any model");
    } else {
        // Move best model into place
        fs.rename(bestCandidatePath, finalPath);
    }
    // Then delete everything else
    fs.delete(candidatesPath, true);
    if (modelUpdateTopic == null) {
        log.info("No update topic configured, not publishing models to a topic");
    } else {
        // Push PMML model onto update topic, if it exists
        Path bestModelPath = new Path(finalPath, MODEL_FILE_NAME);
        if (fs.exists(bestModelPath)) {
            FileStatus bestModelPathFS = fs.getFileStatus(bestModelPath);
            PMML bestModel = null;
            boolean modelNeededForUpdates = canPublishAdditionalModelData();
            boolean modelNotTooLarge = bestModelPathFS.getLen() <= maxMessageSize;
            if (modelNeededForUpdates || modelNotTooLarge) {
                // Either the model is required for publishAdditionalModelData, or required because it's going to
                // be serialized to Kafka
                try (InputStream in = fs.open(bestModelPath)) {
                    bestModel = PMMLUtils.read(in);
                }
            }
            if (modelNotTooLarge) {
                modelUpdateTopic.send("MODEL", PMMLUtils.toString(bestModel));
            } else {
                modelUpdateTopic.send("MODEL-REF", fs.makeQualified(bestModelPath).toString());
            }
            if (modelNeededForUpdates) {
                publishAdditionalModelData(sparkContext, bestModel, newData, pastData, finalPath,
                        modelUpdateTopic);
            }
        }
    }
    if (newData != null) {
        newData.unpersist();
    }
    if (pastData != null) {
        pastData.unpersist();
    }
}
From source file: com.cloudera.oryx.ml.MLUpdate.java
License: Open Source License
private Path findBestCandidatePath(JavaSparkContext sparkContext, JavaRDD<M> newData, JavaRDD<M> pastData,
        List<List<?>> hyperParameterCombos, Path candidatesPath) throws IOException {
    Map<Path, Double> pathToEval = ExecUtils.collectInParallel(candidates,
            Math.min(evalParallelism, candidates), true,
            i -> buildAndEval(i, hyperParameterCombos, sparkContext, newData, pastData, candidatesPath),
            Collectors.toMap(Pair::getFirst, Pair::getSecond));
    FileSystem fs = null;
    Path bestCandidatePath = null;
    double bestEval = Double.NEGATIVE_INFINITY;
    for (Map.Entry<Path, Double> pathEval : pathToEval.entrySet()) {
        Path path = pathEval.getKey();
        if (fs == null) {
            fs = FileSystem.get(path.toUri(), sparkContext.hadoopConfiguration());
        }
        if (path != null && fs.exists(path)) {
            Double eval = pathEval.getValue();
            if (!Double.isNaN(eval)) {
                // Valid evaluation; if it's the best so far, keep it
                if (eval > bestEval) {
                    log.info("Best eval / model path is now {} / {}", eval, path);
                    bestEval = eval;
                    bestCandidatePath = path;
                }
            } else if (bestCandidatePath == null && testFraction == 0.0) {
                // Normal case when eval is disabled; no eval is possible, but keep the one model
                // that was built
                bestCandidatePath = path;
            }
        } // else can't do anything; no model at all
    }
    if (threshold != null && bestEval < threshold) {
        log.info("Best model at {} had eval {}, but did not exceed threshold {}; discarding model",
                bestCandidatePath, bestEval, threshold);
        bestCandidatePath = null;
    }
    return bestCandidatePath;
}
From source file: com.cloudera.oryx.ml.MLUpdate.java
License: Open Source License
private Pair<Path, Double> buildAndEval(int i, List<List<?>> hyperParameterCombos,
        JavaSparkContext sparkContext, JavaRDD<M> newData, JavaRDD<M> pastData, Path candidatesPath) {
    // % = cycle through combinations if needed
    List<?> hyperParameters = hyperParameterCombos.get(i % hyperParameterCombos.size());
    Path candidatePath = new Path(candidatesPath, Integer.toString(i));
    log.info("Building candidate {} with params {}", i, hyperParameters);
    Pair<JavaRDD<M>, JavaRDD<M>> trainTestData = splitTrainTest(newData, pastData);
    JavaRDD<M> allTrainData = trainTestData.getFirst();
    JavaRDD<M> testData = trainTestData.getSecond();
    Double eval = Double.NaN;
    if (empty(allTrainData)) {
        log.info("No train data to build a model");
    } else {
        PMML model = buildModel(sparkContext, allTrainData, hyperParameters, candidatePath);
        if (model == null) {
            log.info("Unable to build a model");
        } else {
            Path modelPath = new Path(candidatePath, MODEL_FILE_NAME);
            log.info("Writing model to {}", modelPath);
            try {
                FileSystem fs = FileSystem.get(candidatePath.toUri(), sparkContext.hadoopConfiguration());
                fs.mkdirs(candidatePath);
                try (OutputStream out = fs.create(modelPath)) {
                    PMMLUtils.write(model, out);
                }
            } catch (IOException ioe) {
                throw new IllegalStateException(ioe);
            }
            if (empty(testData)) {
                log.info("No test data available to evaluate model");
            } else {
                log.info("Evaluating model");
                eval = evaluate(sparkContext, model, candidatePath, testData, allTrainData);
            }
        }
    }
    log.info("Model eval for params {}: {} ({})", hyperParameters, eval, candidatePath);
    return new Pair<>(candidatePath, eval);
}
From source file: com.cloudera.recordservice.hive.RecordServiceHiveInputFormat.java
License: Apache License
/**
 * Copied HiveInputFormat
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    init(job);
    Path[] dirs = FileInputFormat.getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }
    JobConf newjob = new JobConf(job);
    List<InputSplit> result = new ArrayList<InputSplit>();
    List<Path> currentDirs = new ArrayList<Path>();
    Class<? extends InputFormat> currentInputFormatClass = null;
    TableDesc currentTable = null;
    TableScanOperator currentTableScan = null;
    // for each dir, get the InputFormat, and do getSplits.
    for (Path dir : dirs) {
        PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
        Class<? extends InputFormat> inputFormatClass = part.getInputFileFormatClass();
        TableDesc table = part.getTableDesc();
        TableScanOperator tableScan = null;
        List<String> aliases = mrwork_.getPathToAliases().get(dir.toUri().toString());
        // Make filter pushdown information available to getSplits.
        if ((aliases != null) && (aliases.size() == 1)) {
            Operator op = mrwork_.getAliasToWork().get(aliases.get(0));
            if ((op != null) && (op instanceof TableScanOperator)) {
                tableScan = (TableScanOperator) op;
                // push down projections.
                ColumnProjectionUtils.appendReadColumns(newjob, tableScan.getNeededColumnIDs(),
                        tableScan.getNeededColumns());
                // push down filters
                pushFilters(newjob, tableScan);
            }
        }
        if (!currentDirs.isEmpty() && inputFormatClass.equals(currentInputFormatClass)
                && table.equals(currentTable) && tableScan == currentTableScan) {
            currentDirs.add(dir);
            continue;
        }
        if (!currentDirs.isEmpty()) {
            LOG.info("Generating splits");
            addSplitsForGroup(currentDirs, currentTableScan, newjob,
                    getInputFormatFromCache(currentInputFormatClass, job), currentInputFormatClass,
                    currentDirs.size() * (numSplits / dirs.length), currentTable, result);
        }
        currentDirs.clear();
        currentDirs.add(dir);
        currentTableScan = tableScan;
        currentTable = table;
        currentInputFormatClass = inputFormatClass;
    }
    if (dirs.length != 0) {
        LOG.info("Generating splits");
        addSplitsForGroup(currentDirs, currentTableScan, newjob,
                getInputFormatFromCache(currentInputFormatClass, job), currentInputFormatClass,
                currentDirs.size() * (numSplits / dirs.length), currentTable, result);
    }
    LOG.info("number of splits " + result.size());
    return result.toArray(new HiveInputSplitShim[result.size()]);
}
From source file: com.cloudera.science.ml.client.params.RecordOutputParameters.java
License: Open Source License
private void createHiveTable(Spec spec, String outputPath) throws IOException {
    if (hiveStr != null) {
        String dbName = HCatalog.getDbName(hiveStr);
        String tblName = HCatalog.getTableName(hiveStr);
        if (HCatalog.tableExists(dbName, tblName)) {
            LOG.warn("Hive table named " + hiveStr + " already exists");
            return;
        }
        LOG.info("Creating an external Hive table named: " + hiveStr);
        Table tbl = new Table(dbName, tblName);
        tbl.setOwner(UserGroupInformation.getCurrentUser().getShortUserName());
        tbl.setTableType(TableType.EXTERNAL_TABLE);
        tbl.setProperty("EXTERNAL", "TRUE");
        Path output = FileSystem.get(new Configuration()).makeQualified(new Path(outputPath));
        tbl.setDataLocation(output.toUri());
        List<FieldSchema> fields = Lists.newArrayList();
        for (int i = 0; i < spec.size(); i++) {
            FieldSpec fs = spec.getField(i);
            FieldSchema hfs = new FieldSchema();
            hfs.setName(fs.name());
            switch (fs.spec().getDataType()) {
            case BOOLEAN:
                hfs.setType("boolean");
                break;
            case INT:
                hfs.setType("int");
                break;
            case LONG:
                hfs.setType("bigint");
                break;
            case DOUBLE:
                hfs.setType("double");
                break;
            case STRING:
                hfs.setType("string");
                break;
            }
            fields.add(hfs);
        }
        tbl.setFields(fields);
        if (FORMAT_AVRO.equals(outputType)) {
            try {
                tbl.setSerializationLib("org.apache.hadoop.hive.serde2.avro.AvroSerDe");
                tbl.setInputFormatClass("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat");
                tbl.setOutputFormatClass("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat");
                tbl.setProperty("avro.schema.literal", Spec2Schema.create(spec).toString());
            } catch (Exception e) {
                LOG.error("Error configuring Hive Avro table, table creation failed", e);
                return;
            }
        } else { // FORMAT_CSV
            try {
                tbl.setSerializationLib("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe");
                tbl.setSerdeParam("field.delim", delim);
                tbl.setSerdeParam("serialization.format", ",");
                tbl.setInputFormatClass("org.apache.hadoop.mapred.TextInputFormat");
                tbl.setOutputFormatClass("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat");
            } catch (Exception e) {
                LOG.error("Error configuring Hive for CSV files, table creation failed", e);
                return;
            }
        }
        HCatalog.createTable(tbl);
    }
}
From source file: com.conversantmedia.mapreduce.tool.DistributedResourceManager.java
License: Apache License
/**
 * Register this resource. If the resource is a simple property (i.e. primitive or String),
 * it will place it in the configuration. Otherwise, it uses the distributed cache
 * mechanism as required.
 *
 * @param key the resource key. Usually a property/field name.
 * @param value the resource.
 * @throws IOException if resource cannot be serialized
 */
public void registerResource(String key, Object value) throws IOException {
    if (value == null) {
        return;
    }
    String valueString = null;
    // First, determine our approach:
    if (value instanceof String) {
        valueString = (String) value;
    } else if (Primitives.isWrapperType(value.getClass())) {
        valueString = String.valueOf(value);
    }
    // If this is a Path or File object we'll place it
    // on the distributed cache
    else if (value instanceof Path) {
        Path path = (Path) value;
        valueString = path.getName();
        // Distribute the file the new way
        this.job.addCacheFile(path.toUri());
    } else if (value instanceof File) {
        File file = (File) value;
        valueString = file.getName();
        // Distribute the file
        distributeLocalFile(file);
    }
    // Check if it's serializable
    else if (value instanceof java.io.Serializable) {
        // Serialize the object and place it on the distributed cache
        ObjectOutputStream out = null;
        try {
            File beanSerFile = File.createTempFile(value.getClass().getName(), ".ser");
            FileOutputStream fileOut = new FileOutputStream(beanSerFile);
            out = new ObjectOutputStream(fileOut);
            out.writeObject(value);
            valueString = beanSerFile.getName();
            // Distribute the file
            distributeLocalFile(beanSerFile);
        } finally {
            IOUtils.closeQuietly(out);
        }
    } else {
        throw new IllegalArgumentException("Resource [" + key + "] is not serializable.");
    }
    // Setup the config key
    String configKey = CONFIGKEYBASE_RESOURCE + key;
    getConf().set(configKey, value.getClass().getName() + VALUE_SEP + valueString);
}
From source file: com.conversantmedia.mapreduce.tool.DistributedResourceManager.java
License: Apache License
private Path distributeLocalFile(File file) throws IOException {
    FileSystem fs = FileSystem.get(getConf());
    Path dest = new Path(file.getAbsolutePath());
    fs.copyFromLocalFile(true, true, new Path(file.getAbsolutePath()), dest);
    this.job.addCacheFile(dest.toUri());
    return dest;
}