List of usage examples for org.apache.hadoop.fs.FileSystem.get
public static FileSystem get(Configuration conf) throws IOException
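Before the project-specific examples below, a minimal self-contained sketch of the call itself. The class name FsGetExample and the path /tmp/example are illustrative only, not taken from any of the source files:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class FsGetExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();  // reads core-site.xml / hdfs-site.xml from the classpath
        FileSystem fs = FileSystem.get(conf);      // returns the filesystem named by fs.defaultFS (HDFS, local, ...)
        Path p = new Path("/tmp/example");         // hypothetical path, for illustration only
        System.out.println(p + " exists: " + fs.exists(p));
    }
}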
From source file:MapFileRW.java
License:Open Source License
static void write() throws IOException {
    String filename = "/indextest/testmapfile";
    Configuration conf = new Configuration();
    MapFile.Writer writer = new MapFile.Writer(conf, FileSystem.get(conf), filename, IndexKey.class,
            IndexValue.class);
    writer.close();
}
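Note that write() above opens the writer and closes it again without appending anything, so it produces an empty MapFile. A minimal sketch of appending an entry before the close, assuming IndexKey implements WritableComparable and IndexValue implements Writable (which MapFile.Writer requires), with keys appended in sorted order:

// Sketch only: IndexKey/IndexValue come from the example's own project; MapFile.Writer
// requires a WritableComparable key and a Writable value, appended in sorted key order.
IndexKey key = new IndexKey();
IndexValue value = new IndexValue();
// ... populate key and value ...
writer.append(key, value); // would precede writer.close() in write() above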
From source file:MapFileRW.java
License:Open Source License
static void read(String filename, int num) throws Exception {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    MapFile.Reader reader = new MapFile.Reader(fs, filename, conf);
    IndexKey key = new IndexKey();
    IndexValue value = new IndexValue();
    int i = 0;
    while (reader.next(key, value)) {
        System.out.print(value.getFileindex() + " ");
        System.out.println(value.getRowid());
        key.reset();
        if ((i++) >= num)
            break;
    }
    reader.close(); // release the underlying streams
}
From source file:SingleFileWriter.java
License:Apache License
public int run(String[] args) throws Exception {
    if (args.length < 1) {
        System.err.println("SingleFileWriter [fileSize i.e. 1g/10g/100g]");
        return 1;
    }

    double fileSize = Double.parseDouble((args[0].split("g|G"))[0]) * 1024 * 1024 * 1024;
    String hdfsFolder = "/hdfs_test/";
    String hdfsFile = hdfsFolder + args[0];
    short replication = 1;
    boolean overWrite = true;
    int bufferSize = 65536;
    int blockSize = 536870912;
    double numIters = fileSize / (double) bufferSize;

    /* Initialize byte buffer: fill it completely with int values. */
    ByteBuffer buf = ByteBuffer.allocate(bufferSize);
    buf.order(ByteOrder.nativeOrder());
    for (int k = 0; k < bufferSize / Integer.BYTES; k++) { // Integer.BYTES (4), not Integer.SIZE (32), so the whole buffer is filled
        buf.putInt(k);
    }
    buf.flip();

    /* Create file on HDFS */
    Configuration conf = getConf();
    FileSystem fs = FileSystem.get(conf);
    Path hdfsFilePath = new Path(hdfsFile);
    OutputStream os = fs.create(hdfsFilePath, overWrite, bufferSize, replication, blockSize);

    /* Write the content of the byte buffer to the HDFS file. */
    Timer t = new Timer();
    t.start(0);
    for (long i = 0; i < numIters; i++) {
        os.write(buf.array());
        buf.flip();
    }
    t.end(0);
    os.close();

    fs.delete(hdfsFilePath, true);
    t.dump();
    return 0;
}
From source file:DumpPageRankRecordsToPlainText.java
License:Apache License
/** Runs this tool. */
@SuppressWarnings({ "static-access" })
public int run(String[] args) throws Exception {
    Options options = new Options();

    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("input path").create(INPUT));
    options.addOption(OptionBuilder.withArgName("path").hasArg().withDescription("output path").create(OUTPUT));

    CommandLine cmdline;
    CommandLineParser parser = new GnuParser();
    try {
        cmdline = parser.parse(options, args);
    } catch (ParseException exp) {
        System.err.println("Error parsing command line: " + exp.getMessage());
        return -1;
    }

    if (!cmdline.hasOption(INPUT) || !cmdline.hasOption(OUTPUT)) {
        System.out.println("args: " + Arrays.toString(args));
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(120);
        formatter.printHelp(this.getClass().getName(), options);
        ToolRunner.printGenericCommandUsage(System.out);
        return -1;
    }

    String inputPath = cmdline.getOptionValue(INPUT);
    String outputPath = cmdline.getOptionValue(OUTPUT);

    LOG.info("Tool name: " + DumpPageRankRecordsToPlainText.class.getSimpleName());
    LOG.info(" - input: " + inputPath);
    LOG.info(" - output: " + outputPath);

    Configuration conf = new Configuration();
    conf.setInt("mapred.min.split.size", 1024 * 1024 * 1024);

    Job job = Job.getInstance(conf);
    job.setJobName(DumpPageRankRecordsToPlainText.class.getSimpleName());
    job.setJarByClass(DumpPageRankRecordsToPlainText.class);
    job.setNumReduceTasks(0);

    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileOutputFormat.setOutputPath(job, new Path(outputPath));

    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(PageRankNode.class);

    // Delete the output directory if it exists already.
    FileSystem.get(conf).delete(new Path(outputPath), true);

    job.waitForCompletion(true);

    return 0;
}
From source file:HoodieJavaApp.java
License:Apache License
public void run() throws Exception {
    // Spark session setup..
    SparkSession spark = SparkSession.builder().appName("Hoodie Spark APP")
            .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").master("local[1]")
            .getOrCreate();
    JavaSparkContext jssc = new JavaSparkContext(spark.sparkContext());
    FileSystem fs = FileSystem.get(jssc.hadoopConfiguration());

    // Generator of some records to be loaded in.
    HoodieTestDataGenerator dataGen = null;
    if (nonPartitionedTable) {
        // All data goes to base-path
        dataGen = new HoodieTestDataGenerator(new String[] { "" });
    } else {
        dataGen = new HoodieTestDataGenerator();
    }

    /**
     * Commit with only inserts
     */
    // Generate some input..
    List<String> records1 = DataSourceTestUtils
            .convertToStringList(dataGen.generateInserts("001"/* ignore */, 100));
    Dataset<Row> inputDF1 = spark.read().json(jssc.parallelize(records1, 2));

    // Save as hoodie dataset (copy on write)
    DataFrameWriter<Row> writer = inputDF1.write().format("com.uber.hoodie") // specify the hoodie source
            .option("hoodie.insert.shuffle.parallelism", "2") // any hoodie client config can be passed like this
            .option("hoodie.upsert.shuffle.parallelism", "2") // full list in HoodieWriteConfig & its package
            .option(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY(), tableType) // Hoodie Table Type
            .option(DataSourceWriteOptions.OPERATION_OPT_KEY(), DataSourceWriteOptions.INSERT_OPERATION_OPT_VAL()) // insert
            .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key") // This is the record key
            .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition") // this is the partition to place it into
            .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp") // used to combine duplicate records in input/with disk val
            .option(HoodieWriteConfig.TABLE_NAME, tableName) // Used by hive sync and queries
            .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(),
                    nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName()
                            : SimpleKeyGenerator.class.getCanonicalName()) // Add Key Extractor
            .mode(SaveMode.Overwrite); // This will remove any existing data at the path below, and create a new dataset if needed
    updateHiveSyncConfig(writer);
    writer.save(tablePath); // ultimately where the dataset will be placed
    String commitInstantTime1 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
    logger.info("First commit at instant time :" + commitInstantTime1);

    /**
     * Commit that updates records
     */
    List<String> records2 = DataSourceTestUtils
            .convertToStringList(dataGen.generateUpdates("002"/* ignore */, 100));
    Dataset<Row> inputDF2 = spark.read().json(jssc.parallelize(records2, 2));
    writer = inputDF2.write().format("com.uber.hoodie").option("hoodie.insert.shuffle.parallelism", "2")
            .option("hoodie.upsert.shuffle.parallelism", "2")
            .option(DataSourceWriteOptions.STORAGE_TYPE_OPT_KEY(), tableType) // Hoodie Table Type
            .option(DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY(), "_row_key")
            .option(DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY(), "partition")
            .option(DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY(), "timestamp")
            .option(DataSourceWriteOptions.KEYGENERATOR_CLASS_OPT_KEY(),
                    nonPartitionedTable ? NonpartitionedKeyGenerator.class.getCanonicalName()
                            : SimpleKeyGenerator.class.getCanonicalName()) // Add Key Extractor
            .option(HoodieWriteConfig.TABLE_NAME, tableName).mode(SaveMode.Append);
    updateHiveSyncConfig(writer);
    writer.save(tablePath);
    String commitInstantTime2 = HoodieDataSourceHelpers.latestCommit(fs, tablePath);
    logger.info("Second commit at instant time :" + commitInstantTime2);

    /**
     * Read & do some queries
     */
    Dataset<Row> hoodieROViewDF = spark.read().format("com.uber.hoodie")
            // pass any path glob, can include hoodie & non-hoodie datasets
            .load(tablePath + (nonPartitionedTable ? "/*" : "/*/*/*/*"));
    hoodieROViewDF.registerTempTable("hoodie_ro");
    spark.sql("describe hoodie_ro").show();
    // all trips whose fare was greater than 2.
    spark.sql("select fare, begin_lon, begin_lat, timestamp from hoodie_ro where fare > 2.0").show();

    if (tableType.equals(HoodieTableType.COPY_ON_WRITE.name())) {
        /**
         * Consume incrementally, only changes in commit 2 above. Currently only supported for COPY_ON_WRITE TABLE
         */
        Dataset<Row> hoodieIncViewDF = spark.read().format("com.uber.hoodie")
                .option(DataSourceReadOptions.VIEW_TYPE_OPT_KEY(),
                        DataSourceReadOptions.VIEW_TYPE_INCREMENTAL_OPT_VAL())
                .option(DataSourceReadOptions.BEGIN_INSTANTTIME_OPT_KEY(), commitInstantTime1) // Only changes in write 2 above
                .load(tablePath); // For incremental view, pass in the root/base path of dataset
        logger.info("You will only see records from : " + commitInstantTime2);
        hoodieIncViewDF.groupBy(hoodieIncViewDF.col("_hoodie_commit_time")).count().show();
    }
}
From source file:PostgresToSeq.java
License:Apache License
public static void main(String args[]) throws Exception {
    if (args.length != 2) {
        System.err.println("Arguments: [input postgres table] [output sequence file]");
        return;
    }
    String inputFileName = args[0];
    String outputDirName = args[1];

    Configuration configuration = new Configuration();
    FileSystem fs = FileSystem.get(configuration);
    Writer writer = new SequenceFile.Writer(fs, configuration, new Path(outputDirName + "/chunk-0"),
            Text.class, Text.class);

    Connection c = null;
    Statement stmt = null;
    try {
        Class.forName("org.postgresql.Driver");
        c = DriverManager.getConnection("jdbc:postgresql://192.168.50.170:5432/uzeni", "postgres",
                "dbwpsdkdl");
        c.setAutoCommit(false);
        System.out.println("Opened database successfully");

        stmt = c.createStatement();
        ResultSet rs = stmt.executeQuery("SELECT * FROM " + inputFileName);
        int count = 0;
        Text key = new Text();
        Text value = new Text();
        while (rs.next()) {
            String seq = rs.getString("seq");
            String rep = rs.getString("rep");
            String body = rs.getString("body");
            String category = rep;
            String id = seq;
            String message = body;
            key.set("/" + category + "/" + id);
            value.set(message);
            writer.append(key, value);
            count++;
        }
        rs.close();
        stmt.close();
        c.close();
        writer.close();
        System.out.println("Wrote " + count + " entries.");
    } catch (Exception e) {
        System.err.println(e.getClass().getName() + ": " + e.getMessage());
        System.exit(1); // exit non-zero on failure
    }
}
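The SequenceFile.Writer constructor used above is deprecated in Hadoop 2.x in favor of an option-based factory method. A sketch of the equivalent setup with SequenceFile.createWriter, reusing the configuration, path, and key/value classes from the example:

// Option-based replacement for the deprecated constructor (Hadoop 2.x+):
SequenceFile.Writer writer = SequenceFile.createWriter(configuration,
        SequenceFile.Writer.file(new Path(outputDirName + "/chunk-0")),
        SequenceFile.Writer.keyClass(Text.class),
        SequenceFile.Writer.valueClass(Text.class));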
From source file:ClassifierHD.java
License:Apache License
public static void main(String[] args) throws Exception {
    if (args.length < 6) { // six arguments are consumed below (job_id is unused)
        System.out.println(
                "Arguments: [model] [label index] [dictionary] [document frequency] [postgres table] [hdfs dir] [job_id]");
        return;
    }
    String modelPath = args[0];
    String labelIndexPath = args[1];
    String dictionaryPath = args[2];
    String documentFrequencyPath = args[3];
    String tablename = args[4];
    String inputDir = args[5];

    Configuration configuration = new Configuration();

    // model is a matrix (wordId, labelId) => probability score
    NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
    StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);

    // labels is a map label => classId
    Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
    Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
    Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration,
            new Path(documentFrequencyPath));

    // analyzer used to extract words from tweets
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);

    int labelCount = labels.size();
    int documentCount = documentFrequency.get(-1).intValue();
    System.out.println("Number of labels: " + labelCount);
    System.out.println("Number of documents in training set: " + documentCount);

    Connection conn = null;
    PreparedStatement pstmt = null;
    try {
        Class.forName("org.postgresql.Driver");
        conn = DriverManager.getConnection("jdbc:postgresql://192.168.50.170:5432/uzeni", "postgres",
                "dbwpsdkdl");
        conn.setAutoCommit(false);
        String sql = "INSERT INTO " + tablename
                + " (id,gtime,wtime,target,num,link,body,rep) VALUES (?,?,?,?,?,?,?,?);";
        pstmt = conn.prepareStatement(sql);

        FileSystem fs = FileSystem.get(configuration);
        FileStatus[] status = fs.listStatus(new Path(inputDir));
        BufferedWriter bw = new BufferedWriter(
                new OutputStreamWriter(fs.create(new Path(inputDir + "/rep.list"), true)));
        for (int i = 0; i < status.length; i++) {
            BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(status[i].getPath())));
            if (status[i].getPath().getName().equals("rep.list")) {
                continue;
            }
            // The first line of each input file is a CSV header (gtime,wtime,target,num,link);
            // the remaining lines form the message body.
            int lv_HEAD = 1;
            int lv_cnt = 0;
            String lv_gtime = null;
            String lv_wtime = null;
            String lv_target = null;
            BigDecimal lv_num = null;
            String lv_link = null;
            String[] lv_args;
            String lv_line;
            StringBuilder lv_txt = new StringBuilder();
            while ((lv_line = br.readLine()) != null) {
                if (lv_cnt < lv_HEAD) {
                    lv_args = lv_line.split(",");
                    lv_gtime = lv_args[0];
                    lv_wtime = lv_args[1];
                    lv_target = lv_args[2];
                    lv_num = new BigDecimal(lv_args[3]);
                    lv_link = lv_args[4];
                } else {
                    lv_txt.append(lv_line + '\n');
                }
                lv_cnt++;
            }
            br.close();
            String id = status[i].getPath().getName();
            String message = lv_txt.toString();

            // Tokenize the message and count occurrences of words known to the dictionary.
            Multiset<String> words = ConcurrentHashMultiset.create();
            TokenStream ts = analyzer.tokenStream("text", new StringReader(message));
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            int wordCount = 0;
            while (ts.incrementToken()) {
                if (termAtt.length() > 0) {
                    String word = termAtt.toString();
                    Integer wordId = dictionary.get(word);
                    if (wordId != null) {
                        words.add(word);
                        wordCount++;
                    }
                }
            }
            ts.end();
            ts.close();

            // Build a TF-IDF feature vector for the message.
            Vector vector = new RandomAccessSparseVector(10000);
            TFIDF tfidf = new TFIDF();
            for (Multiset.Entry<String> entry : words.entrySet()) {
                String word = entry.getElement();
                int count = entry.getCount();
                Integer wordId = dictionary.get(word);
                Long freq = documentFrequency.get(wordId);
                double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
                vector.setQuick(wordId, tfIdfValue);
            }

            // Classify: pick the label with the highest score.
            Vector resultVector = classifier.classifyFull(vector);
            double bestScore = -Double.MAX_VALUE;
            int bestCategoryId = -1;
            for (Element element : resultVector.all()) {
                int categoryId = element.index();
                double score = element.get();
                if (score > bestScore) {
                    bestScore = score;
                    bestCategoryId = categoryId;
                }
            }

            pstmt.setString(1, id);
            pstmt.setString(2, lv_gtime);
            pstmt.setString(3, lv_wtime);
            pstmt.setString(4, lv_target);
            pstmt.setBigDecimal(5, lv_num);
            pstmt.setString(6, lv_link);
            pstmt.setString(7, message.substring(1, Math.min(50, message.length())));
            pstmt.setString(8, labels.get(bestCategoryId));
            pstmt.addBatch();
            bw.write(id + "\t" + labels.get(bestCategoryId) + "\n");
        }
        pstmt.executeBatch();
        pstmt.close();
        conn.commit();
        conn.close();
        bw.close();
    } catch (Exception e) {
        System.err.println(e.getClass().getName() + ": " + e.getMessage());
        System.exit(1); // exit non-zero on failure
    }
    analyzer.close();
}
From source file:ColumnStorageBasicTest.java
License:Open Source License
public void testLoadNaviFromHead() {
    try {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);
        Path path = new Path(prefix);
        fs.delete(path, true);

        createAllSingleProject(fs);

        ColumnProject cp = new ColumnProject(conf);
        cp.loadColmnInfoFromHeadInfo(fs, path);

        checkAllColumnInfo(cp.infos());
    } catch (Exception e) {
        e.printStackTrace();
        fail("get exception:" + e.getMessage());
    }
}
From source file:ColumnStorageBasicTest.java
License:Open Source License
public void testGetFileNameByFieldIndex() {
    try {
        String navigator = prefix;
        Path path = new Path(navigator);
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(conf);

        ColumnProject cp = new ColumnProject(conf);
        cp.loadColmnInfoFromHeadInfo(fs, path);

        ArrayList<String> fileList = null;
        fileList = cp.getFileNameByIndex(null);
        if (fileList != null) {
            fail("should null");
        }

        ArrayList<Short> idxs = new ArrayList<Short>(10);
        idxs.add((short) 10);
        fileList = cp.getFileNameByIndex(idxs);
        if (fileList != null) {
            fail("should null");
        }

        idxs.clear();
        idxs.add((short) 0);
        fileList = cp.getFileNameByIndex(idxs);
        if (fileList == null) {
            fail("should not null");
        }
        if (fileList.size() != 1) {
            fail("error fileList:" + fileList.size());
        }
        if (!fileList.get(0).equals(byteFileName)) {
            fail("error file name:" + fileList.get(0));
        }

        idxs.clear();
        idxs.add((short) 0);
        idxs.add((short) 5);
        fileList = cp.getFileNameByIndex(idxs);
        if (fileList == null) {
            fail("should not null");
        }
        if (fileList.size() != 2) {
            fail("error fileList:" + fileList.size());
        }
        if (!fileList.get(0).equals(byteFileName)) {
            fail("error file name1:" + fileList.get(0));
        }
        if (!fileList.get(1).equals(doubleFileName)) {
            fail("error file name2:" + fileList.get(1));
        }

        idxs.clear();
        idxs.add((short) 0);
        idxs.add((short) 5);
        idxs.add((short) 10);
        fileList = cp.getFileNameByIndex(idxs);
        if (fileList != null) {
            fail("should null");
        }
    } catch (Exception e) {
        e.printStackTrace();
        fail("get exception:" + e.getMessage());
    }
}
From source file:ColumnStorageBasicTest.java
License:Open Source License
public void testConstructorFieldNoExist() {
    try {
        Configuration conf = new Configuration();
        Path path = new Path(prefix);
        FileSystem fs = FileSystem.get(conf);
        fs.delete(path, true);

        createAllSingleProject(fs);
        createMultiProject(fs);

        ArrayList<Short> idxs = new ArrayList<Short>(10);
        idxs.add((short) 10); // field index 10 does not exist in the project

        ColumnStorageClient client = new ColumnStorageClient(path, idxs, conf);
        fail("should get exception");
    } catch (SEException.InvalidParameterException e) {
        // expected: constructing a client with a non-existent field index must fail
    } catch (Exception e) {
        e.printStackTrace();
        fail("get exception:" + e.getMessage());
    }
}