Example usage for org.apache.hadoop.fs Path toString

List of usage examples for org.apache.hadoop.fs Path toString

Introduction

On this page you can find example usages of org.apache.hadoop.fs.Path.toString().

Prototype

@Override
public String toString()

Usage
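
Before the sourced examples, here is a minimal, self-contained sketch of typical Path.toString() usage; the class name, host, and file names below are hypothetical.

import org.apache.hadoop.fs.Path;

public class PathToStringExample {
    public static void main(String[] args) {
        // Build a Path from a parent path and a child name (values here are made up)
        Path base = new Path("hdfs://namenode:8020/data");
        Path part = new Path(base, "part-00000");

        // toString() renders the full path as a String, which is what many APIs
        // (for example SparkContext.textFile and JavaRDD.saveAsTextFile) expect
        String location = part.toString();
        System.out.println(location); // hdfs://namenode:8020/data/part-00000
    }
}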

From source file:com.cloudera.oryx.app.batch.mllib.als.ALSUpdate.java

License:Open Source License

private static void saveFeaturesRDD(JavaPairRDD<Integer, float[]> features, Path path,
        Broadcast<Map<Integer, String>> bIndexToID) {
    log.info("Saving features RDD to {}", path);
    features.map(keyAndVector -> {
        String id = bIndexToID.value().get(keyAndVector._1());
        float[] vector = keyAndVector._2();
        return TextUtils.joinJSON(Arrays.asList(id, vector));
    }).saveAsTextFile(path.toString(), GzipCodec.class);
}

From source file:com.cloudera.oryx.app.batch.mllib.als.ALSUpdate.java

License:Open Source License

private static JavaPairRDD<String, float[]> readFeaturesRDD(JavaSparkContext sparkContext, Path path) {
    log.info("Loading features RDD from {}", path);
    JavaRDD<String> featureLines = sparkContext.textFile(path.toString());
    return featureLines.mapToPair(line -> {
        List<?> update = TextUtils.readJSON(line, List.class);
        String key = update.get(0).toString();
        float[] vector = TextUtils.convertViaJSON(update.get(1), float[].class);
        return new Tuple2<>(key, vector);
    });
}

From source file:com.cloudera.oryx.app.mllib.als.ALSUpdate.java

License:Open Source License

private static void saveFeaturesRDD(JavaPairRDD<Integer, double[]> features, Path path,
        final Map<Integer, String> reverseIDMapping) {
    log.info("Saving features RDD to {}", path);
    features.map(new Function<Tuple2<Integer, double[]>, String>() {
        @Override
        public String call(Tuple2<Integer, double[]> keyAndVector) {
            Integer id = keyAndVector._1();
            String originalKey = reverseIDMapping.get(id);
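            // Fall back to the numeric index when no original ID is mapped for it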
            Object key = originalKey == null ? id : originalKey;
            double[] vector = keyAndVector._2();
            return TextUtils.joinJSON(Arrays.asList(key, vector));
        }
    }).saveAsTextFile(path.toString(), GzipCodec.class);
}

From source file:com.cloudera.oryx.app.mllib.als.ALSUpdate.java

License:Open Source License

private static JavaPairRDD<String, double[]> readFeaturesRDD(JavaSparkContext sparkContext, Path path) {
    log.info("Loading features RDD from {}", path);
    JavaRDD<String> featureLines = sparkContext.textFile(path.toString());
    return featureLines.mapToPair(new PairFunction<String, String, double[]>() {
        @Override
        public Tuple2<String, double[]> call(String line) throws IOException {
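            // Each line is expected to be a JSON array of [key, vector]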
            List<?> update = MAPPER.readValue(line, List.class);
            String key = update.get(0).toString();
            double[] vector = MAPPER.convertValue(update.get(1), double[].class);
            return new Tuple2<>(key, vector);
        }
    });
}

From source file:com.cloudera.oryx.lambda.batch.BatchLayer.java

License:Open Source License

public synchronized void start() {
    String id = getID();
    if (id != null) {
        log.info("Starting Batch Layer {}", id);
    }

    streamingContext = buildStreamingContext();
    JavaSparkContext sparkContext = streamingContext.sparkContext();
    Configuration hadoopConf = sparkContext.hadoopConfiguration();

    Path checkpointPath = new Path(new Path(modelDirString), ".checkpoint");
    log.info("Setting checkpoint dir to {}", checkpointPath);
    sparkContext.setCheckpointDir(checkpointPath.toString());

    log.info("Creating message stream from topic");
    JavaInputDStream<MessageAndMetadata<K, M>> kafkaDStream = buildInputDStream(streamingContext);
    JavaPairDStream<K, M> pairDStream = kafkaDStream
            .mapToPair(mAndM -> new Tuple2<>(mAndM.key(), mAndM.message()));

    Class<K> keyClass = getKeyClass();
    Class<M> messageClass = getMessageClass();
    pairDStream.foreachRDD(new BatchUpdateFunction<>(getConfig(), keyClass, messageClass, keyWritableClass,
            messageWritableClass, dataDirString, modelDirString, loadUpdateInstance(), streamingContext));

    // "Inline" saveAsNewAPIHadoopFiles to be able to skip saving empty RDDs
    pairDStream.foreachRDD(new SaveToHDFSFunction<>(dataDirString + "/oryx", "data", keyClass, messageClass,
            keyWritableClass, messageWritableClass, hadoopConf));

    // Must use the raw Kafka stream to get offsets
    kafkaDStream.foreachRDD(new UpdateOffsetsFn<>(getGroupID(), getInputTopicLockMaster()));

    if (maxDataAgeHours != NO_MAX_AGE) {
        pairDStream.foreachRDD(new DeleteOldDataFn<>(hadoopConf, dataDirString, Pattern.compile("-(\\d+)\\."),
                maxDataAgeHours));
    }
    if (maxModelAgeHours != NO_MAX_AGE) {
        pairDStream.foreachRDD(
                new DeleteOldDataFn<>(hadoopConf, modelDirString, Pattern.compile("(\\d+)"), maxModelAgeHours));
    }

    log.info("Starting Spark Streaming");

    streamingContext.start();
}

From source file:com.cloudera.oryx.lambda.batch.BatchUpdateFunction.java

License:Open Source License

/**
 * @return the paths of the given {@link FileStatus}es joined into one comma-separated String
 * @see FileInputFormat#addInputPath(org.apache.hadoop.mapreduce.Job, Path)
 */
private static String joinFSPaths(FileSystem fs, FileStatus[] statuses) {
    StringBuilder joined = new StringBuilder();
    for (FileStatus status : statuses) {
        if (joined.length() > 0) {
            joined.append(',');
        }
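        // Qualify the path against the FileSystem and escape it, since commas delimit the joined list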
        Path path = fs.makeQualified(status.getPath());
        joined.append(StringUtils.escapeString(path.toString()));
    }
    return joined.toString();
}

From source file:com.cloudera.oryx.ml.mllib.als.ALSUpdate.java

License:Open Source License

private static void saveFeaturesRDD(RDD<Tuple2<Object, double[]>> features, Path path) {
    log.info("Saving features RDD to {}", path);
    fromRDD(features).map(new Function<Tuple2<Object, double[]>, String>() {
        @Override
        public String call(Tuple2<Object, double[]> keyAndVector) throws IOException {
            Object key = keyAndVector._1();
            double[] vector = keyAndVector._2();
            return MAPPER.writeValueAsString(Arrays.asList(key, vector));
        }
    }).saveAsTextFile(path.toString(), GzipCodec.class);
}

From source file:com.cloudera.oryx.ml.mllib.als.ALSUpdate.java

License:Open Source License

private static RDD<Tuple2<Integer, double[]>> readFeaturesRDD(JavaSparkContext sparkContext, Path path) {
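    // Parse JSON lines of [id, vector] into (Integer, double[]) pairs and return the underlying Scala RDD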
    log.info("Loading features RDD from {}", path);
    JavaRDD<String> featureLines = sparkContext.textFile(path.toString());
    return featureLines.map(new Function<String, Tuple2<Integer, double[]>>() {
        @Override
        public Tuple2<Integer, double[]> call(String line) throws IOException {
            List<?> update = MAPPER.readValue(line, List.class);
            Integer key = Integer.valueOf(update.get(0).toString());
            double[] vector = MAPPER.convertValue(update.get(1), double[].class);
            return new Tuple2<>(key, vector);
        }
    }).rdd();
}

From source file:com.cloudera.recordbreaker.analyzer.FSAnalyzer.java

License:Open Source License

public String checkHiveSupport(final Path fpath) {
    return dbQueue.execute(new SQLiteJob<String>() {
        protected String job(SQLiteConnection db) throws SQLiteException {
            SQLiteStatement stmt = db.prepare("SELECT hiveTableName FROM HiveTables WHERE fpath = ?");
            try {
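                // The fpath column stores the path's String form, so bind fpath.toString()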
                stmt.bind(1, fpath.toString());
                while (stmt.step()) {
                    return stmt.columnString(0);
                }
                return null;
            } finally {
                stmt.dispose();
            }
        }
    }).complete();
}

From source file:com.cloudera.recordbreaker.analyzer.FSAnalyzer.java

License:Open Source License

public void addHiveSupport(final Path fpath, final String tablename) {
    dbQueue.execute(new SQLiteJob<Object>() {
        protected Object job(SQLiteConnection db) throws SQLiteException {
            SQLiteStatement stmt = db.prepare("INSERT into HiveTables VALUES(?, ?)");
            try {
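                // Store the path's String form alongside its Hive table name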
                stmt.bind(1, fpath.toString());
                stmt.bind(2, tablename);
                stmt.step();
                return null;
            } finally {
                stmt.dispose();
            }
        }
    }).complete();
}