Example usage for org.apache.hadoop.fs FileSystem get

List of usage examples for org.apache.hadoop.fs FileSystem get

Introduction

This page collects usage examples for org.apache.hadoop.fs.FileSystem.get(Configuration).

Prototype

public static FileSystem get(Configuration conf) throws IOException 

Document

Returns the configured FileSystem implementation.
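
For orientation before the full examples below, here is a minimal, self-contained sketch of the call. It is an illustrative example rather than code from any of the projects listed here; the class name is made up, and it assumes fs.defaultFS is supplied by a core-site.xml on the classpath (otherwise the local file system is returned).

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;

public class FileSystemGetExample {
    public static void main(String[] args) throws IOException {
        // Loads core-default.xml and core-site.xml; fs.defaultFS decides which
        // FileSystem implementation (HDFS, local, etc.) is returned.
        Configuration conf = new Configuration();

        // Returns the configured FileSystem implementation; instances are cached
        // and shared per URI and user, so repeated calls are cheap.
        FileSystem fs = FileSystem.get(conf);
        System.out.println("Default file system: " + fs.getUri());

        // Avoid closing a shared instance unless you own its lifecycle;
        // Hadoop closes cached file systems in a JVM shutdown hook.
    }
}

Every example that follows obtains its FileSystem the same way and then layers MapFile, SequenceFile, MapReduce, Spark, or test code on top of it.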

Usage

From source file:MapFileRW.java

License:Open Source License

static void write() throws IOException {
    String filename = "/indextest/testmapfile";
    Configuration conf = new Configuration();
    MapFile.Writer writer = new MapFile.Writer(conf, FileSystem.get(conf), filename, IndexKey.class,
            IndexValue.class);

    writer.close();
}

From source file:MapFileRW.java

License:Open Source License

static void read(String filename, int num) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    MapFile.Reader reader = new MapFile.Reader(fs, filename, conf);
    IndexKey key = new IndexKey();
    IndexValue value = new IndexValue();
    int i = 0;
    while (reader.next(key, value)) {
        System.out.print(value.getFileindex() + " ");
        System.out.println(value.getRowid());
        key.reset();
        if ((i++) >= num)
            break;

    }
}

From source file:SingleFileWriter.java

License:Apache License

public int run(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("SingleFileWriter [fileSize ie. 1g/10g/100g]");
        return 1;
    }

    double fileSize = Double.parseDouble((args[0].split("g|G"))[0]) * 1024 * 1024 * 1024;

    String hdfsFolder = "/hdfs_test/";
    String hdfsFile = hdfsFolder + args[0];
    short replication = 1;
    boolean overWrite = true;
    int bufferSize = 65536;
    int blockSize = 536870912;
    double numIters = fileSize / (double) bufferSize;

    /* Initialize byte buffer */
    ByteBuffer buf = ByteBuffer.allocate(bufferSize);
    buf.order(ByteOrder.nativeOrder());
    for (int k = 0; k < bufferSize / Integer.SIZE; k++) {
        buf.putInt(k);
    }
    buf.flip();

    /* Create file on HDFS */
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    Path hdfsFilePath = new Path(hdfsFile);
    OutputStream os = fs.create(hdfsFilePath, overWrite, bufferSize, replication, blockSize);
    /* Write the content of the byte buffer 
     to the HDFS file*/
    Timer t = new Timer();
    t.start(0);
    for (long i = 0; i < numIters; i++) {
        os.write(buf.array());
        buf.flip();
    }
    t.end(0);
    os.close();
    fs.delete(hdfsFilePath, true);

    t.dump();

    return 0;
}

From source file:DumpPageRankRecordsToPlainText.java

License:Apache License

/**
 * Runs this tool.
 */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();

    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);

    LOG.info("Tool name: " + DumpPageRankRecordsToPlainText.class.getSimpleName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    Configuration conf = new Configuration();
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);

    Job job = Job.getInstance(conf);
    job.setJobName(DumpPageRankRecordsToPlainText.class.getSimpleName());
    job.setJarByClass(DumpPageRankRecordsToPlainText.class);

    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}

From source file:HoodieJavaApp.java

License:Apache License

public void run() throws Exception {

    // Spark session setup..
    SparkSession spark = SparkSession.builder().appName("Hoodie Spark APP")
            .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").master("local[1]")
            .getOrCreate();
    JavaSparkContext jssc = new JavaSparkContext(spark.sparkContext());
    FileSystem fs = FileSystem.get(jssc.hadoopConfiguration());

    // Generator of some records to be loaded in.
    HoodieTestDataGenerator dataGen = null;
    if (nonPartitionedTable) {
        // All data goes to base-path
        dataGen = new HoodieTestDataGenerator(new String[] { "" });
    } else {
        dataGen = new HoodieTestDataGenerator();
    }

    /**
     * Commit with only inserts
     */
    // Generate some input..
    List<String> records1 = DataSourceTestUtils
            .convertToStringList(dataGen.generateInserts("001"/* ignore */, 100));
    Dataset<Row> inputDF1 = spark.read().json(jssc.parallelize(records1, 2));

    // Save as hoodie dataset (copy on write)
    DataFrameWriter<Row> writer = inputDF1.write().format("com.uber.hoodie") // specify the hoodie source
            .option("hoodie.insert.shuffle.parallelism", "2") // any hoodie client config can be passed like this
            .option("hoodie.upsert.shuffle.parallelism", "2") // full list in HoodieWriteConfig & its package
            .option(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY(), tableType) // Hoodie Table Type
            .option(DataSourceWriteOptions.OPERATION_OPT_KEY(),
                    DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL()) // insert
            .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key") // This is the record key
            .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition") // this is the partition to place it into
            .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp") // use to combine duplicate records in input/with disk val
            .option(HoodieWriteConfig.TABLE_NAME, tableName) // Used by hive sync and queries
            .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(),
                    nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName()
                            : SimpleKeyGenerator.class.getCanonicalName()) // Add Key Extractor
            .mode(SaveMode.Overwrite); // This will remove any existing data at the path below and create a new dataset if needed

    updateHiveSyncConfig(writer);
    writer.save(tablePath); // ultimately where the dataset will be placed
    String commitInstantTime1 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
    logger.info("First commit at instant time :" + commitInstantTime1);

    /**
     * Commit that updates records
     */
    List<String> records2 = DataSourceTestUtils
            .convertToStringList(dataGen.generateUpdates("002"/* ignore */, 100));
    Dataset<Row> inputDF2 = spark.read().json(jssc.parallelize(records2, 2));
    writer = inputDF2.write().format("com.uber.hoodie").option("hoodie.insert.shuffle.parallelism", "2")
            .option("hoodie.upsert.shuffle.parallelism", "2")
            .option(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY(), tableType) // Hoodie Table Type
            .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key")
            .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition")
            .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp")
            .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(),
                    nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName()
                            : SimpleKeyGenerator.class.getCanonicalName()) // Add Key Extractor
            .option(HoodieWriteConfig.TABLE_NAME, tableName).mode(SaveMode.Append);

    updateHiveSyncConfig(writer);
    writer.save(tablePath);
    String commitInstantTime2 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
    logger.info("Second commit at instant time :" + commitInstantTime2);

    /**
     * Read & do some queries
     */
    Dataset<Row> hoodieROViewDF = spark.read().format("com.uber.hoodie")
            // pass any path glob, can include hoodie & non-hoodie
            // datasets
            .load(tablePath + (nonPartitionedTable ? "/*" : "/*/*/*/*"));
    hoodieROViewDF.registerTempTable("hoodie_ro");
    spark.sql("describe hoodie_ro").show();
    // all trips whose fare was greater than 2.
    spark.sql("select fare, begin_lon, begin_lat, timestamp from hoodie_ro where fare > 2.0").show();

    if (tableType.equals(HoodieTableType.COPY_ON_WRITE.name())) {
        /**
         * Consume incrementally, only changes in commit 2 above. Currently only supported for COPY_ON_WRITE TABLE
         */
        Dataset<Row> hoodieIncViewDF = spark.read().format("com.uber.hoodie")
                .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(),
                        DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL())
                .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), commitInstantTime1) // Only changes in write 2 above
                .load(tablePath); // For incremental view, pass in the root/base path of dataset

        logger.info("You will only see records from : " + commitInstantTime2);
        hoodieIncViewDF.groupBy(hoodieIncViewDF.col("_hoodie_commit_time")).count().show();
    }
}

From source file:PostgresToSeq.java

License:Apache License

public static void main(String args[]) throws Exception {
    if (args.length != 2) {
        System.err.println("Arguments: [input postgres table] [output sequence file]");
        return;
    }
    String inputFileName = args[0];
    String outputDirName = args[1];
    Configuration configuration = new Configuration();
    FileSystem fs = FileSystem.get(configuration);
    Writer writer = new SequenceFile.Writer(fs, configuration, new Path(outputDirName + "/chunk-0"), Text.class,
            Text.class);
    Connection c = null;
    Statement stmt = null;
    try {
        Class.forName("org.postgresql.Driver");
        c = DriverManager.getConnection("jdbc:postgresql://192.168.50.170:5432/uzeni", "postgres", "dbwpsdkdl");
        c.setAutoCommit(false);
        System.out.println("Opened database successfully");
        stmt = c.createStatement();
        ResultSet rs = stmt.executeQuery("SELECT * FROM " + inputFileName);
        int count = 0;
        Text key = new Text();
        Text value = new Text();

        while (rs.next()) {
            String seq = rs.getString("seq");
            String rep = rs.getString("rep");
            String body = rs.getString("body");
            String category = rep;
            String id = seq;
            String message = body;
            key.set("/" + category + "/" + id);
            value.set(message);
            writer.append(key, value);
            count++;
        }
        rs.close();
        stmt.close();
        c.close();
        writer.close();
        System.out.println("Wrote " + count + " entries.");
    } catch (Exception e) {
        System.err.println(e.getClass().getName() + ": " + e.getMessage());
        System.exit(0);
    }
}

From source file:ClassifierHD.java

License:Apache License

public static void main(String[] args) throws Exception {
    if (args.length < 6) {
        System.out.println(
                "Arguments: [model] [label index] [dictionary] [document frequency] [postgres table] [hdfs dir] [job_id]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tablename = args[4];
    String inputDir = args[5];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);

    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract word from tweet
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();

    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    Connection conn = null;
    PreparedStatement pstmt = null;

    try {
        Class.forName("org.postgresql.Driver");
        conn = DriverManager.getConnection("jdbc:postgresql://192.168.50.170:5432/uzeni", "postgres",
                "dbwpsdkdl");
        conn.setAutoCommit(false);
        String sql = "INSERT INTO " + tablename
                + " (id,gtime,wtime,target,num,link,body,rep) VALUES (?,?,?,?,?,?,?,?);";
        pstmt = conn.prepareStatement(sql);

        FileSystem fs = FileSystem.get(configuration);
        FileStatus[] status = fs.listStatus(new Path(inputDir));
        BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(fs.create(new Path(inputDir + "/rep.list"), true)));

        for (int i = 0; i < status.length; i++) {
            // Skip the output listing itself before opening a reader, so no stream is leaked.
            if (status[i].getPath().getName().equals("rep.list")) {
                continue;
            }
            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(status[i].getPath())));
            int lv_HEAD = 1;
            int lv_cnt = 0;
            String lv_gtime = null;
            String lv_wtime = null;
            String lv_target = null;
            BigDecimal lv_num = null;
            String lv_link = null;
            String[] lv_args;
            String lv_line;
            StringBuilder lv_txt = new StringBuilder();
            while ((lv_line = br.readLine()) != null) {
                if (lv_cnt < lv_HEAD) {
                    lv_args = lv_line.split(",");
                    lv_gtime = lv_args[0];
                    lv_wtime = lv_args[1];
                    lv_target = lv_args[2];
                    lv_num = new BigDecimal(lv_args[3]);
                    lv_link = lv_args[4];
                } else {
                    lv_txt.append(lv_line + '\n');
                }
                lv_cnt++;
            }
            br.close();

            String id = status[i].getPath().getName();
            String message = lv_txt.toString();

            Multiset<String> words = ConcurrentHashMultiset.create();

            TokenStream ts = analyzer.tokenStream("text", new StringReader(message));
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            int wordCount = 0;
            while (ts.incrementToken()) {
                if (termAtt.length() > 0) {
                    String word = ts.getAttribute(CharTermAttribute.class).toString();
                    Integer wordId = dictionary.get(word);
                    if (wordId != null) {
                        words.add(word);
                        wordCount++;
                    }
                }
            }

            ts.end();
            ts.close();

            Vector vector = new RandomAccessSparseVector(10000);
            TFIDF tfidf = new TFIDF();
            for (Multiset.Entry<String> entry : words.entrySet()) {
                String word = entry.getElement();
                int count = entry.getCount();
                Integer wordId = dictionary.get(word);
                Long freq = documentFrequency.get(wordId);
                double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
                vector.setQuick(wordId, tfIdfValue);
            }
            Vector resultVector = classifier.classifyFull(vector);
            double bestScore = -Double.MAX_VALUE;
            int bestCategoryId = -1;
            for (Element element : resultVector.all()) {
                int categoryId = element.index();
                double score = element.get();
                if (score > bestScore) {
                    bestScore = score;
                    bestCategoryId = categoryId;
                }
            }
            //System.out.println(message);
            //System.out.println(" => "+ lv_gtime + lv_wtime + lv_link + id + ":" + labels.get(bestCategoryId));
            pstmt.setString(1, id);
            pstmt.setString(2, lv_gtime);
            pstmt.setString(3, lv_wtime);
            pstmt.setString(4, lv_target);
            pstmt.setBigDecimal(5, lv_num);
            pstmt.setString(6, lv_link);
            pstmt.setString(7, message.substring(1, Math.min(50, message.length())));
            pstmt.setString(8, labels.get(bestCategoryId));
            pstmt.addBatch();
            bw.write(id + "\t" + labels.get(bestCategoryId) + "\n");
        }
        pstmt.executeBatch();
        //pstmt.clearParameters();
        pstmt.close();
        conn.commit();
        conn.close();
        bw.close();
    } catch (Exception e) {
        System.err.println(e.getClass().getName() + ": " + e.getMessage());
        System.exit(0);
    }
    analyzer.close();
}

From source file:ColumnStorageBasicTest.java

License:Open Source License

public void testLoadNaviFromHead() {
    try {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path(prefix);

        fs.delete(path, true);
        createAllSingleProject(fs);

        ColumnProject cp = new ColumnProject(conf);
        cp.loadColmnInfoFromHeadInfo(fs, path);

        checkAllColumnInfo(cp.infos());
    } catch (Exception e) {
        e.printStackTrace();
        fail("get exception:" + e.getMessage());
    }
}

From source file:ColumnStorageBasicTest.java

License:Open Source License

public void testGetFileNameByFieldIndex() {
    try {
        String navigator = prefix;
        Path path = new Path(navigator);
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        ColumnProject cp = new ColumnProject(conf);
        cp.loadColmnInfoFromHeadInfo(fs, path);

        ArrayList<String> fileList = null;
        fileList = cp.getFileNameByIndex(null);
        if (fileList != null) {
            fail("should be null");
        }

        ArrayList<Short> idxs = new ArrayList<Short>(10);
        idxs.add((short) 10);
        fileList = cp.getFileNameByIndex(idxs);
        if (fileList != null) {
            fail("should be null");
        }

        idxs.clear();
        idxs.add((short) 0);
        fileList = cp.getFileNameByIndex(idxs);
        if (fileList == null) {
            fail("should not be null");
        }
        if (fileList.size() != 1) {
            fail("error fileList:" + fileList.size());
        }
        if (!fileList.get(0).equals(byteFileName)) {
            fail("error file name:" + fileList.get(0));
        }

        idxs.clear();
        idxs.add((short) 0);
        idxs.add((short) 5);
        fileList = cp.getFileNameByIndex(idxs);
        if (fileList == null) {
            fail("should not be null");
        }
        if (fileList.size() != 2) {
            fail("error fileList:" + fileList.size());
        }
        if (!fileList.get(0).equals(byteFileName)) {
            fail("error file name1:" + fileList.get(0));
        }
        if (!fileList.get(1).equals(doubleFileName)) {
            fail("error file name2:" + fileList.get(1));
        }

        idxs.clear();
        idxs.add((short) 0);
        idxs.add((short) 5);
        idxs.add((short) 10);
        fileList = cp.getFileNameByIndex(idxs);
        if (fileList != null) {
            fail("should be null");
        }
    } catch (Exception e) {
        e.printStackTrace();
        fail("get exception:" + e.getMessage());
    }
}

From source file:ColumnStorageBasicTest.java

License:Open Source License

public void testConstructorFieldNoExist() {
    try {
        Configuration conf = new Configuration();
        Path path = new Path(prefix);

        FileSystem fs = FileSystem.get(conf);
        fs.delete(path, true);

        createAllSingleProject(fs);
        createMultiProject(fs);

        ArrayList<Short> idxs = new ArrayList<Short>(10);
        idxs.add((short) 10);
        ColumnStorageClient client = new ColumnStorageClient(path, idxs, conf);

        fail("should get exception");
    } catch (SEException.InvalidParameterException e) {

    } catch (Exception e) {
        e.printStackTrace();
        fail("get exception:" + e.getMessage());
    }
}