List of usage examples for org.apache.hadoop.fs.Path.toUri()
public URI toUri()
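Before the examples, a minimal self-contained sketch of what toUri() gives you and the FileSystem.get(uri, conf) idiom that recurs in the examples below. The path string, host, and port here are illustrative assumptions, not taken from any of the source files.

import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class PathToUriExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical path; a local, HDFS, or S3A path string works the same way
        Path path = new Path("hdfs://namenode:8020/data/input/part-00000");

        // toUri() exposes the Path's scheme/authority/path components as a java.net.URI
        URI uri = path.toUri();
        System.out.println("scheme    = " + uri.getScheme());    // hdfs
        System.out.println("authority = " + uri.getAuthority()); // namenode:8020
        System.out.println("path      = " + uri.getPath());      // /data/input/part-00000

        // The common idiom below: resolve the FileSystem that matches the path's URI
        // rather than relying on the default filesystem from the configuration
        FileSystem fs = FileSystem.get(uri, new Configuration());
        System.out.println(fs.getClass().getName());
    }
}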
From source file: com.cloudera.oryx.app.pmml.AppPMMLUtils.java
License: Open Source License
public static PMML readPMMLFromUpdateKeyMessage(String key, String message, Configuration hadoopConf)
        throws IOException {
    String pmmlString;
    switch (key) {
    case "MODEL":
        pmmlString = message;
        break;
    case "MODEL-REF":
        // Allowing null is mostly for integration tests
        if (hadoopConf == null) {
            hadoopConf = new Configuration();
        }
        Path messagePath = new Path(message);
        FileSystem fs = FileSystem.get(messagePath.toUri(), hadoopConf);
        try (InputStreamReader in = new InputStreamReader(fs.open(messagePath), StandardCharsets.UTF_8)) {
            pmmlString = CharStreams.toString(in);
        } catch (FileNotFoundException fnfe) {
            log.warn("Unable to load model file at {}; ignoring", messagePath);
            return null;
        }
        break;
    default:
        throw new IllegalArgumentException("Unknown key " + key);
    }
    return PMMLUtils.fromString(pmmlString);
}
From source file: com.cloudera.oryx.lambda.batch.BatchUpdateFunction.java
License: Open Source License
@Override
public void call(JavaPairRDD<K, M> newData, Time timestamp) throws IOException, InterruptedException {
    if (newData.isEmpty()) {
        log.info("No data in current generation's RDD; nothing to do");
        return;
    }
    log.info("Beginning update at {}", timestamp);
    Configuration hadoopConf = sparkContext.hadoopConfiguration();
    if (hadoopConf.getResource("core-site.xml") == null) {
        log.warn("Hadoop config like core-site.xml was not found; "
                + "is the Hadoop config directory on the classpath?");
    }
    JavaPairRDD<K, M> pastData;
    Path inputPathPattern = new Path(dataDirString + "/*/part-*");
    FileSystem fs = FileSystem.get(inputPathPattern.toUri(), hadoopConf);
    FileStatus[] inputPathStatuses = fs.globStatus(inputPathPattern);
    if (inputPathStatuses == null || inputPathStatuses.length == 0) {
        log.info("No past data at path(s) {}", inputPathPattern);
        pastData = null;
    } else {
        log.info("Found past data at path(s) like {}", inputPathStatuses[0].getPath());
        Configuration updatedConf = new Configuration(hadoopConf);
        updatedConf.set(FileInputFormat.INPUT_DIR, joinFSPaths(fs, inputPathStatuses));
        @SuppressWarnings("unchecked")
        JavaPairRDD<Writable, Writable> pastWritableData = (JavaPairRDD<Writable, Writable>) sparkContext
                .newAPIHadoopRDD(updatedConf, SequenceFileInputFormat.class, keyWritableClass,
                        messageWritableClass);
        pastData = pastWritableData.mapToPair(
                new WritableToValueFunction<>(keyClass, messageClass, keyWritableClass, messageWritableClass));
    }
    if (updateTopic == null || updateBroker == null) {
        log.info("Not producing updates to update topic since none was configured");
        updateInstance.runUpdate(sparkContext, timestamp.milliseconds(), newData, pastData, modelDirString,
                null);
    } else {
        // This TopicProducer should not be async; sends one big model generally and
        // needs to occur before other updates reliably rather than be buffered
        try (TopicProducer<String, U> producer = new TopicProducerImpl<>(updateBroker, updateTopic, false)) {
            updateInstance.runUpdate(sparkContext, timestamp.milliseconds(), newData, pastData, modelDirString,
                    producer);
        }
    }
}
From source file: com.cloudera.oryx.lambda.DeleteOldDataFn.java
License: Open Source License
@Override
public void call(T ignored) throws IOException {
    Path dataDirPath = new Path(dataDirString + "/*");
    FileSystem fs = FileSystem.get(dataDirPath.toUri(), hadoopConf);
    FileStatus[] inputPathStatuses = fs.globStatus(dataDirPath);
    if (inputPathStatuses != null) {
        long oldestTimeAllowed = System.currentTimeMillis()
                - TimeUnit.MILLISECONDS.convert(maxAgeHours, TimeUnit.HOURS);
        Arrays.stream(inputPathStatuses).filter(FileStatus::isDirectory).map(FileStatus::getPath)
                .filter(subdir -> {
                    Matcher m = dirTimestampPattern.matcher(subdir.getName());
                    return m.find() && Long.parseLong(m.group(1)) < oldestTimeAllowed;
                }).forEach(subdir -> {
                    log.info("Deleting old data at {}", subdir);
                    try {
                        fs.delete(subdir, true);
                    } catch (IOException e) {
                        log.warn("Unable to delete {}; continuing", subdir, e);
                    }
                });
    }
}
From source file: com.cloudera.oryx.ml.MLUpdate.java
License: Open Source License
@Override
public void runUpdate(JavaSparkContext sparkContext, long timestamp, JavaPairRDD<Object, M> newKeyMessageData,
        JavaPairRDD<Object, M> pastKeyMessageData, String modelDirString,
        TopicProducer<String, String> modelUpdateTopic) throws IOException, InterruptedException {
    Objects.requireNonNull(newKeyMessageData);
    JavaRDD<M> newData = newKeyMessageData.values();
    JavaRDD<M> pastData = pastKeyMessageData == null ? null : pastKeyMessageData.values();
    if (newData != null) {
        newData.cache();
        // This forces caching of the RDD. This shouldn't be necessary but we see some freezes
        // when many workers try to materialize the RDDs at once. Hence the workaround.
        newData.foreachPartition(p -> {
        });
    }
    if (pastData != null) {
        pastData.cache();
        pastData.foreachPartition(p -> {
        });
    }
    List<HyperParamValues<?>> hyperParamValues = getHyperParameterValues();
    int valuesPerHyperParam = HyperParams.chooseValuesPerHyperParam(hyperParamValues.size(), candidates);
    List<List<?>> hyperParameterCombos = HyperParams.chooseHyperParameterCombos(hyperParamValues, candidates,
            valuesPerHyperParam);
    Path modelDir = new Path(modelDirString);
    Path tempModelPath = new Path(modelDir, ".temporary");
    Path candidatesPath = new Path(tempModelPath, Long.toString(System.currentTimeMillis()));
    FileSystem fs = FileSystem.get(modelDir.toUri(), sparkContext.hadoopConfiguration());
    fs.mkdirs(candidatesPath);
    Path bestCandidatePath = findBestCandidatePath(sparkContext, newData, pastData, hyperParameterCombos,
            candidatesPath);
    Path finalPath = new Path(modelDir, Long.toString(System.currentTimeMillis()));
    if (bestCandidatePath == null) {
        log.info("Unable to build any model");
    } else {
        // Move best model into place
        fs.rename(bestCandidatePath, finalPath);
    }
    // Then delete everything else
    fs.delete(candidatesPath, true);
    if (modelUpdateTopic == null) {
        log.info("No update topic configured, not publishing models to a topic");
    } else {
        // Push PMML model onto update topic, if it exists
        Path bestModelPath = new Path(finalPath, MODEL_FILE_NAME);
        if (fs.exists(bestModelPath)) {
            FileStatus bestModelPathFS = fs.getFileStatus(bestModelPath);
            PMML bestModel = null;
            boolean modelNeededForUpdates = canPublishAdditionalModelData();
            boolean modelNotTooLarge = bestModelPathFS.getLen() <= maxMessageSize;
            if (modelNeededForUpdates || modelNotTooLarge) {
                // Either the model is required for publishAdditionalModelData, or required because it's going to
                // be serialized to Kafka
                try (InputStream in = fs.open(bestModelPath)) {
                    bestModel = PMMLUtils.read(in);
                }
            }
            if (modelNotTooLarge) {
                modelUpdateTopic.send("MODEL", PMMLUtils.toString(bestModel));
            } else {
                modelUpdateTopic.send("MODEL-REF", fs.makeQualified(bestModelPath).toString());
            }
            if (modelNeededForUpdates) {
                publishAdditionalModelData(sparkContext, bestModel, newData, pastData, finalPath,
                        modelUpdateTopic);
            }
        }
    }
    if (newData != null) {
        newData.unpersist();
    }
    if (pastData != null) {
        pastData.unpersist();
    }
}
From source file: com.cloudera.oryx.ml.MLUpdate.java
License: Open Source License
private Path findBestCandidatePath(JavaSparkContext sparkContext, JavaRDD<M> newData, JavaRDD<M> pastData,
        List<List<?>> hyperParameterCombos, Path candidatesPath) throws IOException {
    Map<Path, Double> pathToEval = ExecUtils.collectInParallel(candidates,
            Math.min(evalParallelism, candidates), true,
            i -> buildAndEval(i, hyperParameterCombos, sparkContext, newData, pastData, candidatesPath),
            Collectors.toMap(Pair::getFirst, Pair::getSecond));
    FileSystem fs = null;
    Path bestCandidatePath = null;
    double bestEval = Double.NEGATIVE_INFINITY;
    for (Map.Entry<Path, Double> pathEval : pathToEval.entrySet()) {
        Path path = pathEval.getKey();
        if (fs == null) {
            fs = FileSystem.get(path.toUri(), sparkContext.hadoopConfiguration());
        }
        if (path != null && fs.exists(path)) {
            Double eval = pathEval.getValue();
            if (!Double.isNaN(eval)) {
                // Valid evaluation; if it's the best so far, keep it
                if (eval > bestEval) {
                    log.info("Best eval / model path is now {} / {}", eval, path);
                    bestEval = eval;
                    bestCandidatePath = path;
                }
            } else if (bestCandidatePath == null && testFraction == 0.0) {
                // Normal case when eval is disabled; no eval is possible, but keep the one model
                // that was built
                bestCandidatePath = path;
            }
        } // else can't do anything; no model at all
    }
    if (threshold != null && bestEval < threshold) {
        log.info("Best model at {} had eval {}, but did not exceed threshold {}; discarding model",
                bestCandidatePath, bestEval, threshold);
        bestCandidatePath = null;
    }
    return bestCandidatePath;
}
From source file: com.cloudera.oryx.ml.MLUpdate.java
License: Open Source License
private Pair<Path, Double> buildAndEval(int i, List<List<?>> hyperParameterCombos,
        JavaSparkContext sparkContext, JavaRDD<M> newData, JavaRDD<M> pastData, Path candidatesPath) {
    // % = cycle through combinations if needed
    List<?> hyperParameters = hyperParameterCombos.get(i % hyperParameterCombos.size());
    Path candidatePath = new Path(candidatesPath, Integer.toString(i));
    log.info("Building candidate {} with params {}", i, hyperParameters);
    Pair<JavaRDD<M>, JavaRDD<M>> trainTestData = splitTrainTest(newData, pastData);
    JavaRDD<M> allTrainData = trainTestData.getFirst();
    JavaRDD<M> testData = trainTestData.getSecond();
    Double eval = Double.NaN;
    if (empty(allTrainData)) {
        log.info("No train data to build a model");
    } else {
        PMML model = buildModel(sparkContext, allTrainData, hyperParameters, candidatePath);
        if (model == null) {
            log.info("Unable to build a model");
        } else {
            Path modelPath = new Path(candidatePath, MODEL_FILE_NAME);
            log.info("Writing model to {}", modelPath);
            try {
                FileSystem fs = FileSystem.get(candidatePath.toUri(), sparkContext.hadoopConfiguration());
                fs.mkdirs(candidatePath);
                try (OutputStream out = fs.create(modelPath)) {
                    PMMLUtils.write(model, out);
                }
            } catch (IOException ioe) {
                throw new IllegalStateException(ioe);
            }
            if (empty(testData)) {
                log.info("No test data available to evaluate model");
            } else {
                log.info("Evaluating model");
                eval = evaluate(sparkContext, model, candidatePath, testData, allTrainData);
            }
        }
    }
    log.info("Model eval for params {}: {} ({})", hyperParameters, eval, candidatePath);
    return new Pair<>(candidatePath, eval);
}
From source file: com.cloudera.recordservice.hive.RecordServiceHiveInputFormat.java
License: Apache License
/**
 * Copied HiveInputFormat
 */
@Override
public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException {
    init(job);
    Path[] dirs = FileInputFormat.getInputPaths(job);
    if (dirs.length == 0) {
        throw new IOException("No input paths specified in job");
    }
    JobConf newjob = new JobConf(job);
    List<InputSplit> result = new ArrayList<InputSplit>();
    List<Path> currentDirs = new ArrayList<Path>();
    Class<? extends InputFormat> currentInputFormatClass = null;
    TableDesc currentTable = null;
    TableScanOperator currentTableScan = null;
    // for each dir, get the InputFormat, and do getSplits.
    for (Path dir : dirs) {
        PartitionDesc part = getPartitionDescFromPath(pathToPartitionInfo, dir);
        Class<? extends InputFormat> inputFormatClass = part.getInputFileFormatClass();
        TableDesc table = part.getTableDesc();
        TableScanOperator tableScan = null;
        List<String> aliases = mrwork_.getPathToAliases().get(dir.toUri().toString());
        // Make filter pushdown information available to getSplits.
        if ((aliases != null) && (aliases.size() == 1)) {
            Operator op = mrwork_.getAliasToWork().get(aliases.get(0));
            if ((op != null) && (op instanceof TableScanOperator)) {
                tableScan = (TableScanOperator) op;
                // push down projections.
                ColumnProjectionUtils.appendReadColumns(newjob, tableScan.getNeededColumnIDs(),
                        tableScan.getNeededColumns());
                // push down filters
                pushFilters(newjob, tableScan);
            }
        }
        if (!currentDirs.isEmpty() && inputFormatClass.equals(currentInputFormatClass)
                && table.equals(currentTable) && tableScan == currentTableScan) {
            currentDirs.add(dir);
            continue;
        }
        if (!currentDirs.isEmpty()) {
            LOG.info("Generating splits");
            addSplitsForGroup(currentDirs, currentTableScan, newjob,
                    getInputFormatFromCache(currentInputFormatClass, job), currentInputFormatClass,
                    currentDirs.size() * (numSplits / dirs.length), currentTable, result);
        }
        currentDirs.clear();
        currentDirs.add(dir);
        currentTableScan = tableScan;
        currentTable = table;
        currentInputFormatClass = inputFormatClass;
    }
    if (dirs.length != 0) {
        LOG.info("Generating splits");
        addSplitsForGroup(currentDirs, currentTableScan, newjob,
                getInputFormatFromCache(currentInputFormatClass, job), currentInputFormatClass,
                currentDirs.size() * (numSplits / dirs.length), currentTable, result);
    }
    LOG.info("number of splits " + result.size());
    return result.toArray(new HiveInputSplitShim[result.size()]);
}
From source file: com.cloudera.science.ml.client.params.RecordOutputParameters.java
License: Open Source License
private void createHiveTable(Spec spec, String outputPath) throws IOException {
    if (hiveStr != null) {
        String dbName = HCatalog.getDbName(hiveStr);
        String tblName = HCatalog.getTableName(hiveStr);
        if (HCatalog.tableExists(dbName, tblName)) {
            LOG.warn("Hive table named " + hiveStr + " already exists");
            return;
        }
        LOG.info("Creating an external Hive table named: " + hiveStr);
        Table tbl = new Table(dbName, tblName);
        tbl.setOwner(UserGroupInformation.getCurrentUser().getShortUserName());
        tbl.setTableType(TableType.EXTERNAL_TABLE);
        tbl.setProperty("EXTERNAL", "TRUE");
        Path output = FileSystem.get(new Configuration()).makeQualified(new Path(outputPath));
        tbl.setDataLocation(output.toUri());
        List<FieldSchema> fields = Lists.newArrayList();
        for (int i = 0; i < spec.size(); i++) {
            FieldSpec fs = spec.getField(i);
            FieldSchema hfs = new FieldSchema();
            hfs.setName(fs.name());
            switch (fs.spec().getDataType()) {
            case BOOLEAN:
                hfs.setType("boolean");
                break;
            case INT:
                hfs.setType("int");
                break;
            case LONG:
                hfs.setType("bigint");
                break;
            case DOUBLE:
                hfs.setType("double");
                break;
            case STRING:
                hfs.setType("string");
                break;
            }
            fields.add(hfs);
        }
        tbl.setFields(fields);
        if (FORMAT_AVRO.equals(outputType)) {
            try {
                tbl.setSerializationLib("org.apache.hadoop.hive.serde2.avro.AvroSerDe");
                tbl.setInputFormatClass("org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat");
                tbl.setOutputFormatClass("org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat");
                tbl.setProperty("avro.schema.literal", Spec2Schema.create(spec).toString());
            } catch (Exception e) {
                LOG.error("Error configuring Hive Avro table, table creation failed", e);
                return;
            }
        } else { // FORMAT_CSV
            try {
                tbl.setSerializationLib("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe");
                tbl.setSerdeParam("field.delim", delim);
                tbl.setSerdeParam("serialization.format", ",");
                tbl.setInputFormatClass("org.apache.hadoop.mapred.TextInputFormat");
                tbl.setOutputFormatClass("org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat");
            } catch (Exception e) {
                LOG.error("Error configuring Hive for CSV files, table creation failed", e);
                return;
            }
        }
        HCatalog.createTable(tbl);
    }
}
From source file: com.conversantmedia.mapreduce.tool.DistributedResourceManager.java
License: Apache License
/**
 * Register this resource. If the resource is a simple property (i.e. primitive or String),
 * it will place it in the configuration. Otherwise, it uses the distributed cache
 * mechanism as required.
 *
 * @param key the resource key. Usually a property/field name.
 * @param value the resource.
 * @throws IOException if resource cannot be serialized
 */
public void registerResource(String key, Object value) throws IOException {
    if (value == null) {
        return;
    }
    String valueString = null;
    // First, determine our approach:
    if (value instanceof String) {
        valueString = (String) value;
    } else if (Primitives.isWrapperType(value.getClass())) {
        valueString = String.valueOf(value);
    }
    // If this is a Path or File object we'll place it
    // on the distributed cache
    else if (value instanceof Path) {
        Path path = (Path) value;
        valueString = path.getName();
        // Distribute the file the new way
        this.job.addCacheFile(path.toUri());
    } else if (value instanceof File) {
        File file = (File) value;
        valueString = file.getName();
        // Distribute the file
        distributeLocalFile(file);
    }
    // Check if it's serializable
    else if (value instanceof java.io.Serializable) {
        // Serialize the object and place it on the distributed cache
        ObjectOutputStream out = null;
        try {
            File beanSerFile = File.createTempFile(value.getClass().getName(), ".ser");
            FileOutputStream fileOut = new FileOutputStream(beanSerFile);
            out = new ObjectOutputStream(fileOut);
            out.writeObject(value);
            valueString = beanSerFile.getName();
            // Distribute the file
            distributeLocalFile(beanSerFile);
        } finally {
            IOUtils.closeQuietly(out);
        }
    } else {
        throw new IllegalArgumentException("Resource [" + key + "] is not serializable.");
    }
    // Setup the config key
    String configKey = CONFIGKEYBASE_RESOURCE + key;
    getConf().set(configKey, value.getClass().getName() + VALUE_SEP + valueString);
}
From source file: com.conversantmedia.mapreduce.tool.DistributedResourceManager.java
License: Apache License
private Path distributeLocalFile(File file) throws IOException {
    FileSystem fs = FileSystem.get(getConf());
    Path dest = new Path(file.getAbsolutePath());
    fs.copyFromLocalFile(true, true, new Path(file.getAbsolutePath()), dest);
    this.job.addCacheFile(dest.toUri());
    return dest;
}