List of usage examples for org.apache.hadoop.fs.Path.toString()
@Override
public String toString()
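A minimal standalone sketch of the method's behavior before the collected examples (assumes only hadoop-common on the classpath; the path values are illustrative). Path.toString() returns the path's full string form, including the scheme and authority when present, which is why the examples below pass path.toString() to APIs that accept plain String paths:

import org.apache.hadoop.fs.Path;

public class PathToStringDemo {
  public static void main(String[] args) {
    Path local = new Path("/tmp/data");
    Path qualified = new Path("hdfs://namenode:8020/user/oryx/model");
    System.out.println(local.toString());      // prints /tmp/data
    System.out.println(qualified.toString());  // prints hdfs://namenode:8020/user/oryx/model
  }
}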
From source file: com.cloudera.oryx.app.batch.mllib.als.ALSUpdate.java
License: Open Source License

private static void saveFeaturesRDD(JavaPairRDD<Integer, float[]> features, Path path,
                                    Broadcast<Map<Integer, String>> bIndexToID) {
  log.info("Saving features RDD to {}", path);
  features.map(keyAndVector -> {
    // Map the row index back to its original string ID before serializing
    String id = bIndexToID.value().get(keyAndVector._1());
    float[] vector = keyAndVector._2();
    return TextUtils.joinJSON(Arrays.asList(id, vector));
  }).saveAsTextFile(path.toString(), GzipCodec.class);
}
From source file: com.cloudera.oryx.app.batch.mllib.als.ALSUpdate.java
License: Open Source License

private static JavaPairRDD<String, float[]> readFeaturesRDD(JavaSparkContext sparkContext, Path path) {
  log.info("Loading features RDD from {}", path);
  JavaRDD<String> featureLines = sparkContext.textFile(path.toString());
  return featureLines.mapToPair(line -> {
    List<?> update = TextUtils.readJSON(line, List.class);
    String key = update.get(0).toString();
    float[] vector = TextUtils.convertViaJSON(update.get(1), float[].class);
    return new Tuple2<>(key, vector);
  });
}
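The two methods above are a matched pair: toString() hands the same location to saveAsTextFile and textFile. A round-trip sketch (the path is hypothetical, and features, bIndexToID, and sparkContext stand in for values the surrounding ALSUpdate job would supply):

Path featuresPath = new Path("hdfs://namenode:8020/oryx/model/X");  // hypothetical location
saveFeaturesRDD(features, featuresPath, bIndexToID);                // writes gzipped JSON lines
JavaPairRDD<String, float[]> loaded = readFeaturesRDD(sparkContext, featuresPath);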
From source file: com.cloudera.oryx.app.mllib.als.ALSUpdate.java
License: Open Source License

private static void saveFeaturesRDD(JavaPairRDD<Integer, double[]> features, Path path,
                                    final Map<Integer, String> reverseIDMapping) {
  log.info("Saving features RDD to {}", path);
  features.map(new Function<Tuple2<Integer, double[]>, String>() {
    @Override
    public String call(Tuple2<Integer, double[]> keyAndVector) {
      Integer id = keyAndVector._1();
      // Fall back to the numeric ID when no original key is known
      String originalKey = reverseIDMapping.get(id);
      Object key = originalKey == null ? id : originalKey;
      double[] vector = keyAndVector._2();
      return TextUtils.joinJSON(Arrays.asList(key, vector));
    }
  }).saveAsTextFile(path.toString(), GzipCodec.class);
}
From source file: com.cloudera.oryx.app.mllib.als.ALSUpdate.java
License: Open Source License

private static JavaPairRDD<String, double[]> readFeaturesRDD(JavaSparkContext sparkContext, Path path) {
  log.info("Loading features RDD from {}", path);
  JavaRDD<String> featureLines = sparkContext.textFile(path.toString());
  return featureLines.mapToPair(new PairFunction<String, String, double[]>() {
    @Override
    public Tuple2<String, double[]> call(String line) throws IOException {
      List<?> update = MAPPER.readValue(line, List.class);
      String key = update.get(0).toString();
      double[] vector = MAPPER.convertValue(update.get(1), double[].class);
      return new Tuple2<>(key, vector);
    }
  });
}
From source file: com.cloudera.oryx.lambda.batch.BatchLayer.java
License: Open Source License

public synchronized void start() {
  String id = getID();
  if (id != null) {
    log.info("Starting Batch Layer {}", id);
  }
  streamingContext = buildStreamingContext();
  JavaSparkContext sparkContext = streamingContext.sparkContext();
  Configuration hadoopConf = sparkContext.hadoopConfiguration();
  Path checkpointPath = new Path(new Path(modelDirString), ".checkpoint");
  log.info("Setting checkpoint dir to {}", checkpointPath);
  sparkContext.setCheckpointDir(checkpointPath.toString());
  log.info("Creating message stream from topic");
  JavaInputDStream<MessageAndMetadata<K, M>> kafkaDStream = buildInputDStream(streamingContext);
  JavaPairDStream<K, M> pairDStream =
      kafkaDStream.mapToPair(mAndM -> new Tuple2<>(mAndM.key(), mAndM.message()));
  Class<K> keyClass = getKeyClass();
  Class<M> messageClass = getMessageClass();
  pairDStream.foreachRDD(new BatchUpdateFunction<>(getConfig(), keyClass, messageClass,
      keyWritableClass, messageWritableClass, dataDirString, modelDirString,
      loadUpdateInstance(), streamingContext));
  // "Inline" saveAsNewAPIHadoopFiles to be able to skip saving empty RDDs
  pairDStream.foreachRDD(new SaveToHDFSFunction<>(dataDirString + "/oryx", "data",
      keyClass, messageClass, keyWritableClass, messageWritableClass, hadoopConf));
  // Must use the raw Kafka stream to get offsets
  kafkaDStream.foreachRDD(new UpdateOffsetsFn<>(getGroupID(), getInputTopicLockMaster()));
  if (maxDataAgeHours != NO_MAX_AGE) {
    pairDStream.foreachRDD(new DeleteOldDataFn<>(hadoopConf, dataDirString,
        Pattern.compile("-(\\d+)\\."), maxDataAgeHours));
  }
  if (maxModelAgeHours != NO_MAX_AGE) {
    pairDStream.foreachRDD(new DeleteOldDataFn<>(hadoopConf, modelDirString,
        Pattern.compile("(\\d+)"), maxModelAgeHours));
  }
  log.info("Starting Spark Streaming");
  streamingContext.start();
}
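The checkpoint lines above show the conversion this page catalogs: JavaSparkContext.setCheckpointDir accepts a String, not a Path, so the composed Path is passed through toString(). An isolated sketch (the master URL and directories are hypothetical):

import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.JavaSparkContext;

public class CheckpointDemo {
  public static void main(String[] args) {
    JavaSparkContext sc = new JavaSparkContext("local[2]", "checkpoint-demo");
    // Compose the checkpoint dir under a model dir, as BatchLayer does
    Path checkpointPath = new Path(new Path("/tmp/model"), ".checkpoint");
    sc.setCheckpointDir(checkpointPath.toString());
    sc.stop();
  }
}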
From source file: com.cloudera.oryx.lambda.batch.BatchUpdateFunction.java
License: Open Source License

/**
 * @return paths from the given {@link FileStatus}es joined into one comma-separated String
 * @see FileInputFormat#addInputPath(org.apache.hadoop.mapreduce.Job, Path)
 */
private static String joinFSPaths(FileSystem fs, FileStatus[] statuses) {
  StringBuilder joined = new StringBuilder();
  for (FileStatus status : statuses) {
    if (joined.length() > 0) {
      joined.append(',');
    }
    Path path = fs.makeQualified(status.getPath());
    joined.append(StringUtils.escapeString(path.toString()));
  }
  return joined.toString();
}
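A hedged usage sketch for joinFSPaths (the directory and Configuration here are hypothetical). makeQualified plus StringUtils.escapeString ensures each path survives the comma-separated form, since Hadoop input formats split that property on commas:

FileSystem fs = FileSystem.get(conf);                           // conf: a hypothetical Configuration
FileStatus[] statuses = fs.listStatus(new Path("/data/oryx"));  // hypothetical input dir
// FileInputFormat reads its inputs from this comma-separated property
conf.set("mapreduce.input.fileinputformat.inputdir", joinFSPaths(fs, statuses));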
From source file: com.cloudera.oryx.ml.mllib.als.ALSUpdate.java
License: Open Source License

private static void saveFeaturesRDD(RDD<Tuple2<Object, double[]>> features, Path path) {
  log.info("Saving features RDD to {}", path);
  fromRDD(features).map(new Function<Tuple2<Object, double[]>, String>() {
    @Override
    public String call(Tuple2<Object, double[]> keyAndVector) throws IOException {
      Object key = keyAndVector._1();
      double[] vector = keyAndVector._2();
      return MAPPER.writeValueAsString(Arrays.asList(key, vector));
    }
  }).saveAsTextFile(path.toString(), GzipCodec.class);
}
From source file: com.cloudera.oryx.ml.mllib.als.ALSUpdate.java
License: Open Source License

private static RDD<Tuple2<Integer, double[]>> readFeaturesRDD(JavaSparkContext sparkContext, Path path) {
  log.info("Loading features RDD from {}", path);
  JavaRDD<String> featureLines = sparkContext.textFile(path.toString());
  return featureLines.map(new Function<String, Tuple2<Integer, double[]>>() {
    @Override
    public Tuple2<Integer, double[]> call(String line) throws IOException {
      List<?> update = MAPPER.readValue(line, List.class);
      Integer key = Integer.valueOf(update.get(0).toString());
      double[] vector = MAPPER.convertValue(update.get(1), double[].class);
      return new Tuple2<>(key, vector);
    }
  }).rdd();
}
From source file: com.cloudera.recordbreaker.analyzer.FSAnalyzer.java
License: Open Source License

// Returns the Hive table name registered for the given path, or null if none
public String checkHiveSupport(final Path fpath) {
  return dbQueue.execute(new SQLiteJob<String>() {
    protected String job(SQLiteConnection db) throws SQLiteException {
      SQLiteStatement stmt = db.prepare("SELECT hiveTableName FROM HiveTables WHERE fpath = ?");
      try {
        // The path's string form is the lookup key
        stmt.bind(1, fpath.toString());
        while (stmt.step()) {
          return stmt.columnString(0);
        }
        return null;
      } finally {
        stmt.dispose();
      }
    }
  }).complete();
}
From source file: com.cloudera.recordbreaker.analyzer.FSAnalyzer.java
License: Open Source License

// Registers a Hive table name for the given path
public void addHiveSupport(final Path fpath, final String tablename) {
  dbQueue.execute(new SQLiteJob<Object>() {
    protected Object job(SQLiteConnection db) throws SQLiteException {
      SQLiteStatement stmt = db.prepare("INSERT into HiveTables VALUES(?, ?)");
      try {
        stmt.bind(1, fpath.toString());
        stmt.bind(2, tablename);
        stmt.step();
        return null;
      } finally {
        stmt.dispose();
      }
    }
  }).complete();
}