List of usage examples for org.apache.hadoop.fs FileSystem open
public FSDataInputStream open(Path f) throws IOException
public FSDataInputStream open(PathHandle fd) throws IOException
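All of the examples below call the open(Path) overload, which returns an FSDataInputStream, a seekable, position-aware stream. For orientation, here is a minimal self-contained sketch of that call; the class name, path, and default Configuration are illustrative placeholders, not taken from any example below:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class OpenExample {
    public static void main(String[] args) throws IOException {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/example.txt"); // placeholder path
        FileSystem fs = path.getFileSystem(conf);
        // open(Path) returns an FSDataInputStream; try-with-resources closes it.
        try (FSDataInputStream in = fs.open(path);
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                System.out.println(line);
            }
        }
    }
}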
From source file: com.ML_Hadoop.MultipleLinearRegression.MultipleLinearRegressionMapReduce.java

public static void main(String[] args) throws Exception {
    String[] theta;
    int iteration = 0, num_of_iteration = 1;
    int feature_size = 0, input_data_size = 0;
    FileSystem fs;
    Float alpha = 0.1f;
    do {
        Configuration conf = new Configuration();
        fs = FileSystem.get(conf);
        Job job = new Job(conf, "LinearRegressionMapReduce");
        job.setJarByClass(MultipleLinearRegressionMapReduce.class);
        // The job's Configuration is needed for propagating "theta" to the tasks.
        conf = job.getConfiguration();
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(FloatWritable.class);
        job.setMapperClass(MultipleLinearRegressionMap.class);
        job.setReducerClass(MultipleLinearRegressionReduce.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        job.setNumReduceTasks(1); // set mapred.reduce.tasks = 1 (only one reducer)
        FileInputFormat.addInputPath(job, new Path(args[0]));
        Path out = new Path(args[1]);
        if (fs.exists(out)) {
            fs.delete(out, true);
        }
        FileOutputFormat.setOutputPath(job, out);
        alpha = Float.parseFloat(args[2]);
        num_of_iteration = Integer.parseInt(args[3]);
        feature_size = Integer.parseInt(args[4]);
        input_data_size = Integer.parseInt(args[5]);
        conf.setFloat("alpha", alpha);
        conf.setInt("feature_size", feature_size);
        conf.setInt("input_data_size", input_data_size);
        conf.setInt("iteration", iteration);
        theta = new String[feature_size];
        if (iteration == 0) { // first iteration: initialize theta to zeros
            for (int i = 0; i < theta.length; i++) {
                theta[i] = "0.0";
            }
            conf.setStrings("theta", theta);
        } else { // later iterations: read theta back from HDFS
            String uri = "/user/hduser/theta.txt";
            fs = FileSystem.get(conf);
            try (BufferedReader br = new BufferedReader(
                    new InputStreamReader(fs.open(new Path(uri))))) {
                theta = br.readLine().split(",");
            } catch (IOException e) {
                e.printStackTrace();
            }
            conf.setStrings("theta", theta);
        }
        for (int i = 0; i < theta.length; i++) {
            System.out.println("In MapReduce main function: theta[" + i + "] = " + theta[i]);
        }
        try {
            job.waitForCompletion(true);
            iteration++;
        } catch (IOException e) {
            e.printStackTrace();
        }
    } while (iteration < num_of_iteration);
}
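The "propagating theta" comment above works through Hadoop's Configuration: the driver stores the weight vector as a string array, and the tasks read it back with getStrings(). A minimal sketch of that round trip, with made-up values:

import org.apache.hadoop.conf.Configuration;

public class ThetaRoundTrip {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        // Driver side: store the weight vector as a string array.
        conf.setStrings("theta", "0.0", "0.5", "1.25");
        // Task side (e.g., in a Mapper's setup()): read it back.
        String[] theta = conf.getStrings("theta");
        for (String t : theta) {
            System.out.println(t);
        }
    }
}

Note that setStrings() joins its values with commas and getStrings() splits on commas, so individual values must not themselves contain commas.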
From source file: com.mongodb.hadoop.splitter.BSONSplitter.java
License: Apache License

public void loadSplitsFromSplitFile(final FileStatus inputFile, final Path splitFile)
        throws NoSplitFileException, IOException {
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>();
    FileSystem fs = splitFile.getFileSystem(getConf()); // throws IOException
    FileStatus splitFileStatus;
    try {
        splitFileStatus = fs.getFileStatus(splitFile);
        LOG.info("Found split file at : " + splitFileStatus);
    } catch (Exception e) {
        throw new NoSplitFileException();
    }
    FSDataInputStream fsDataStream = fs.open(splitFile); // throws IOException
    while (fsDataStream.getPos() < splitFileStatus.getLen()) {
        callback.reset();
        bsonDec.decode(fsDataStream, callback);
        BSONObject splitInfo = (BSONObject) callback.get();
        splits.add(createFileSplitFromBSON(splitInfo, fs, inputFile));
    }
    splitsList = splits;
}
From source file: com.mongodb.hadoop.splitter.BSONSplitter.java
License: Apache License

public void readSplitsForFile(final FileStatus file) throws IOException {
    Path path = file.getPath();
    ArrayList<FileSplit> splits = new ArrayList<FileSplit>();
    FileSystem fs = path.getFileSystem(getConf());
    long length = file.getLen();
    if (!getConf().getBoolean("bson.split.read_splits", true)) {
        LOG.info("Reading splits is disabled - constructing single split for " + file);
        FileSplit onesplit = createFileSplit(file, fs, 0, length);
        splits.add(onesplit);
        splitsList = splits;
        return;
    }
    if (length != 0) {
        int numDocsRead = 0;
        long splitSize = getSplitSize(getConf(), file);
        if (LOG.isDebugEnabled()) {
            LOG.debug("Generating splits for " + path + " of up to " + splitSize + " bytes.");
        }
        FSDataInputStream fsDataStream = fs.open(path);
        long curSplitLen = 0;
        long curSplitStart = 0;
        try {
            while (fsDataStream.getPos() + 1 < length) {
                lazyCallback.reset();
                lazyDec.decode(fsDataStream, lazyCallback);
                LazyBSONObject bo = (LazyBSONObject) lazyCallback.get();
                int bsonDocSize = bo.getBSONSize();
                if (curSplitLen + bsonDocSize >= splitSize) {
                    FileSplit split = createFileSplit(file, fs, curSplitStart, curSplitLen);
                    splits.add(split);
                    if (LOG.isDebugEnabled()) {
                        LOG.debug(String.format("Creating new split (%d) %s", splits.size(), split));
                    }
                    curSplitStart = fsDataStream.getPos() - bsonDocSize;
                    curSplitLen = 0;
                }
                curSplitLen += bsonDocSize;
                numDocsRead++;
                if (numDocsRead % 1000 == 0) {
                    float splitProgress = 100f * ((float) fsDataStream.getPos() / length);
                    if (LOG.isDebugEnabled()) {
                        LOG.debug(String.format("Read %d docs calculating splits for %s; %3.3f%% complete.",
                                numDocsRead, file.getPath(), splitProgress));
                    }
                }
            }
            if (curSplitLen > 0) {
                FileSplit split = createFileSplit(file, fs, curSplitStart, curSplitLen);
                splits.add(split);
                if (LOG.isDebugEnabled()) {
                    LOG.debug(String.format("Final split (%d) %s", splits.size(), split.getPath()));
                }
            }
            splitsList = splits;
            if (LOG.isDebugEnabled()) {
                LOG.debug("Completed splits calculation for " + file.getPath());
            }
            writeSplits();
        } catch (IOException e) {
            LOG.warn("IOException: " + e);
        } finally {
            fsDataStream.close();
        }
    } else {
        LOG.warn("Zero-length file, skipping split calculation.");
    }
}
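Both BSONSplitter methods above rely on the stream returned by open() tracking its own byte offset: getPos() supplies the loop bound and the split start positions. A minimal sketch of getPos() and seek() on FSDataInputStream, assuming a placeholder file at least 17 bytes long:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class SeekExample {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Path path = new Path("/tmp/data.bson"); // placeholder path
        FileSystem fs = path.getFileSystem(conf);
        try (FSDataInputStream in = fs.open(path)) {
            System.out.println("start offset: " + in.getPos()); // prints 0
            in.seek(16);       // jump to an absolute byte offset
            int b = in.read(); // read one byte at offset 16
            System.out.println("offset after read: " + in.getPos()); // prints 17
        }
    }
}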
From source file: com.moz.fiji.mapreduce.input.impl.WholeFileRecordReader.java
License: Apache License

/**
 * <p>If the file has not already been read, this reads it into memory, so that a call
 * to getCurrentValue() will return the entire contents of this file as Text,
 * and getCurrentKey() will return the qualified path to this file as Text. Then, returns
 * true. If it has already been read, then returns false without updating any internal state.</p>
 *
 * @return Whether the file was read or not.
 * @throws IOException if there is an error reading the file.
 * @throws InterruptedException if there is an error.
 */
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
    if (!mProcessed) {
        if (mFileLength > (long) Integer.MAX_VALUE) {
            throw new IOException("File is longer than Integer.MAX_VALUE.");
        }
        byte[] contents = new byte[(int) mFileLength];
        FileSystem fs = mFileToRead.getFileSystem(mConf);
        FSDataInputStream in = null;
        try {
            // Set the contents of this file.
            in = fs.open(mFileToRead);
            IOUtils.readFully(in, contents, 0, contents.length);
            mFileText.set(contents, 0, contents.length);
            // Set the name of this file.
            String fileName = mFileToRead.makeQualified(fs).toString();
            mFileName.set(fileName);
        } finally {
            IOUtils.closeStream(in);
        }
        mProcessed = true;
        return true;
    }
    return false;
}
From source file: com.moz.fiji.mapreduce.IntegrationTestFijiTableInputFormat.java
License: Apache License

/** Test FijiTableInputFormat in a map-only job. */
@Test
public void testMapJob() throws Exception {
    final Path outputFile = createOutputFile();
    // Create a test job.
    final Job job = setupJob("testMapJob", outputFile, TestMapper.class,
            null, // reducer class
            null, // start key
            null, // limit key
            null); // filter
    // Run the job.
    assertTrue("Hadoop job failed", job.waitForCompletion(true));
    // Check to make sure output exists.
    final FileSystem fs = FileSystem.get(job.getConfiguration());
    assertTrue(fs.exists(outputFile.getParent()));
    // Verify that the output matches what's expected.
    final FSDataInputStream in = fs.open(outputFile);
    final Set<String> actual = Sets.newHashSet(IOUtils.toString(in).trim().split("\n"));
    final Set<String> expected = Sets.newHashSet(
            "usermail.example.com\tAaron Kimball",
            "gmail.com\tJohn Doe",
            "usermail.example.com\tChristophe Bisciglia",
            "usermail.example.com\tKiyan Ahmadizadeh",
            "gmail.com\tJane Doe",
            "usermail.example.com\tGarrett Wu");
    assertEquals("Result of job wasn't what was expected", expected, actual);
    // Clean up.
    fs.delete(outputFile.getParent(), true);
    IOUtils.closeQuietly(in);
    // NOTE: fs should get closed here, but doesn't because of a bug with FileSystem that
    // causes it to close other threads' filesystem objects. For more information
    // see: https://issues.apache.org/jira/browse/HADOOP-7973
}
From source file: com.moz.fiji.mapreduce.IntegrationTestFijiTableInputFormat.java
License: Apache License

/** Test FijiTableInputFormat in a map-only job with start and limit keys. */
@Test
public void testMapJobWithStartAndLimitKeys() throws Exception {
    final Path outputFile = createOutputFile();
    // Set effectively the same entity ID for start and limit, so we should get just the start row.
    final EntityId startEntityId = getFooTable().getEntityId("jane.doe@gmail.com");
    final byte[] endRowKey = startEntityId.getHBaseRowKey();
    final EntityId rawLimitEntityId = HBaseEntityId
            .fromHBaseRowKey(Arrays.copyOf(endRowKey, endRowKey.length + 1));
    // Create a test job.
    final Job job = setupJob("testMapJobWithStartAndLimitKeys", outputFile, TestMapper.class,
            null, // reducer class
            startEntityId,
            rawLimitEntityId,
            null); // filter
    // Run the job.
    assertTrue("Hadoop job failed", job.waitForCompletion(true));
    // Check to make sure output exists.
    final FileSystem fs = FileSystem.get(job.getConfiguration());
    assertTrue(fs.exists(outputFile.getParent()));
    // Verify that the output matches what's expected.
    final FSDataInputStream in = fs.open(outputFile);
    final Set<String> actual = Sets.newHashSet(IOUtils.toString(in).trim().split("\n"));
    final Set<String> expected = Sets.newHashSet("gmail.com\tJane Doe");
    assertEquals("Result of job wasn't what was expected", expected, actual);
    // Clean up.
    fs.delete(outputFile.getParent(), true);
    IOUtils.closeQuietly(in);
    // NOTE: fs should get closed here, but doesn't because of a bug with FileSystem that
    // causes it to close other threads' filesystem objects. For more information
    // see: https://issues.apache.org/jira/browse/HADOOP-7973
}
From source file: com.moz.fiji.mapreduce.IntegrationTestFijiTableInputFormat.java
License: Apache License

/** Test FijiTableInputFormat in a map-only job with a row filter. */
@Test
public void testMapJobWithFilter() throws Exception {
    final FijiRowFilter filter = new ColumnValueEqualsRowFilter("info", "email",
            new DecodedCell<String>(Schema.create(Schema.Type.STRING), "aaron@usermail.example.com"));
    final Path outputFile = createOutputFile();
    // Create a test job.
    final Job job = setupJob("testMapJobWithFilter", outputFile, TestMapper.class,
            null, // reducer class
            null, // start key
            null, // limit key
            filter);
    // Run the job.
    assertTrue("Hadoop job failed", job.waitForCompletion(true));
    // Check to make sure output exists.
    final FileSystem fs = FileSystem.get(job.getConfiguration());
    assertTrue(fs.exists(outputFile.getParent()));
    // Verify that the output matches what's expected.
    final FSDataInputStream in = fs.open(outputFile);
    final Set<String> actual = Sets.newHashSet(IOUtils.toString(in).trim().split("\n"));
    final Set<String> expected = Sets.newHashSet("usermail.example.com\tAaron Kimball");
    assertEquals("Result of job wasn't what was expected", expected, actual);
    // Clean up.
    fs.delete(outputFile.getParent(), true);
    IOUtils.closeQuietly(in);
    // NOTE: fs should get closed here, but doesn't because of a bug with FileSystem that
    // causes it to close other threads' filesystem objects. For more information
    // see: https://issues.apache.org/jira/browse/HADOOP-7973
}
From source file: com.moz.fiji.mapreduce.IntegrationTestFijiTableInputFormat.java
License: Apache License

/** Test FijiTableInputFormat in a MapReduce job. */
@Test
public void testMapReduceJob() throws Exception {
    final Path outputFile = createOutputFile();
    // Create a test job.
    final Job job = setupJob("testMapReduceJob", outputFile, TestMapper.class, TestReducer.class,
            null, // start key
            null, // limit key
            null); // filter
    // Run the job.
    assertTrue("Hadoop job failed", job.waitForCompletion(true));
    // Check to make sure output exists.
    final FileSystem fs = FileSystem.get(job.getConfiguration());
    assertTrue(fs.exists(outputFile.getParent()));
    // Verify that the output matches what's expected.
    final FSDataInputStream in = fs.open(outputFile);
    final Set<String> output = Sets.newHashSet(IOUtils.toString(in).trim().split("\n"));
    final ImmutableMap.Builder<String, Set<String>> builder = ImmutableMap.builder();
    for (String line : output) {
        final String[] keyValue = line.split("\t");
        final String emailDomain = keyValue[0];
        final Set<String> names = Sets.newHashSet(keyValue[1].split(","));
        builder.put(emailDomain, names);
    }
    final Map<String, Set<String>> actual = builder.build();
    final Map<String, Set<String>> expected = ImmutableMap.<String, Set<String>>builder()
            .put("usermail.example.com", Sets.newHashSet(
                    "Aaron Kimball", "Christophe Bisciglia", "Kiyan Ahmadizadeh", "Garrett Wu"))
            .put("gmail.com", Sets.newHashSet("John Doe", "Jane Doe"))
            .build();
    assertEquals("Result of job wasn't what was expected", expected, actual);
    // Clean up.
    fs.delete(outputFile.getParent(), true);
    IOUtils.closeQuietly(in);
    // NOTE: fs should get closed here, but doesn't because of a bug with FileSystem that
    // causes it to close other threads' filesystem objects. For more information
    // see: https://issues.apache.org/jira/browse/HADOOP-7973
}
From source file: com.moz.fiji.mapreduce.lib.bulkimport.DescribedInputTextBulkImporter.java
License: Apache License

/**
 * Sets the path to the text input descriptor file and parses it.
 *
 * @param inputDescriptorFile The input descriptor path.
 * @throws RuntimeException if there's an error reading or parsing the input descriptor.
 */
@HadoopConf(key = CONF_FILE, usage = "The input descriptor file.")
protected final void setInputDescriptorPath(String inputDescriptorFile) {
    if (null == inputDescriptorFile || inputDescriptorFile.isEmpty()) {
        // Remind the user to specify this path.
        LOG.error("No input-descriptor path specified.");
        throw new RuntimeException("No input descriptor file specified on the Configuration."
                + " Did you specify the " + CONF_FILE + " variable?");
    }
    Path descriptorPath = new Path(inputDescriptorFile);
    try {
        LOG.info("Parsing input-descriptor file: " + descriptorPath.toString());
        FileSystem fs = descriptorPath.getFileSystem(getConf());
        FSDataInputStream inputStream = fs.open(descriptorPath);
        mTableImportDescriptor = FijiTableImportDescriptor.createFromEffectiveJson(inputStream);
    } catch (IOException ioe) {
        LOG.error("Could not read input-descriptor file: " + descriptorPath.toString());
        throw new RuntimeException("Could not read file: " + descriptorPath.toString(), ioe);
    }
}
From source file: com.moz.fiji.schema.tools.CreateTableTool.java
License: Apache License

/** {@inheritDoc} */
@Override
protected int run(List<String> nonFlagArgs) throws Exception {
    getPrintStream().println("Parsing table layout: " + mLayout);
    final Path path = new Path(mLayout);
    final FileSystem fs = fileSystemSpecified(path)
            ? path.getFileSystem(getConf())
            : FileSystem.getLocal(getConf());
    final FSDataInputStream inputStream = fs.open(path);
    final TableLayoutDesc tableLayout = FijiTableLayout.readTableLayoutDescFromJSON(inputStream);
    final String tableName = tableLayout.getName();
    Preconditions.checkArgument((mTableURI.getTable() == null) || tableName.equals(mTableURI.getTable()),
            "Table name '%s' does not match URI %s", tableName, mTableURI);

    // For large numbers of initial regions, table creation may take a long time as we wait for
    // the new regions to come online. Increase the HBase RPC timeout to compensate.
    int hbaseTimeout = getConf().getInt("hbase.rpc.timeout", 60000);
    hbaseTimeout = hbaseTimeout * 10;
    getConf().setInt("hbase.rpc.timeout", hbaseTimeout);

    getPrintStream().println("Creating Fiji table " + mTableURI);
    if (mNumRegions >= 1) {
        // Create a table with an initial number of evenly split regions.
        mFiji.createTable(tableLayout, mNumRegions);
    } else if (!mSplitKeyFilePath.isEmpty()) {
        switch (FijiTableLayout.getEncoding(tableLayout.getKeysFormat())) {
        case HASH:
        case HASH_PREFIX:
            throw new IllegalArgumentException(
                    "Row key hashing is enabled for the table. Use --num-regions=N instead.");
        case RAW:
            break;
        case FORMATTED:
            // TODO: Support pre-splitting tables for FORMATTED RKF
            // (https://jira.fiji.org/browse/SCHEMA-172)
            throw new RuntimeException("CLI support for FORMATTED row keys is not yet available");
        default:
            throw new RuntimeException("Unexpected row key encoding: "
                    + FijiTableLayout.getEncoding(tableLayout.getKeysFormat()));
        }
        // Open the split key file.
        final Path splitKeyFilePath = new Path(mSplitKeyFilePath);
        final FileSystem splitKeyPathFs = fileSystemSpecified(splitKeyFilePath)
                ? splitKeyFilePath.getFileSystem(getConf())
                : FileSystem.getLocal(getConf());
        final FSDataInputStream splitKeyFileInputStream = splitKeyPathFs.open(splitKeyFilePath);
        // Read the split keys.
        final List<byte[]> splitKeys = SplitKeyFile.decodeRegionSplitList(splitKeyFileInputStream);
        LOG.debug("Read {} keys from split-key-file '{}':", splitKeys.size(), splitKeyFilePath);
        for (int i = 0; i < splitKeys.size(); ++i) {
            LOG.debug("Split key #{}: {}", i, Bytes.toStringBinary(splitKeys.get(i)));
        }
        // Create the table with the given split keys.
        mFiji.createTable(tableLayout, splitKeys.toArray(new byte[splitKeys.size()][]));
    } else {
        // Create a table with a single initial region.
        mFiji.createTable(tableLayout);
    }
    return SUCCESS;
}
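CreateTableTool picks a filesystem per path: fileSystemSpecified is the tool's own private helper that checks whether the path carries a scheme, falling back to the local filesystem when it does not. A minimal sketch of the two lookup styles; the URIs are placeholders, and constructing the HDFS client does not require a live cluster until an I/O operation runs:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocalFileSystem;
import org.apache.hadoop.fs.Path;

public class FsLookup {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // A path with an explicit scheme resolves to that scheme's filesystem.
        Path qualified = new Path("hdfs://localhost:8020/user/example/layout.json"); // placeholder URI
        FileSystem hdfs = qualified.getFileSystem(conf);
        // A scheme-less path can be forced onto the local filesystem instead.
        LocalFileSystem local = FileSystem.getLocal(conf);
        System.out.println(hdfs.getUri());  // hdfs://localhost:8020
        System.out.println(local.getUri()); // file:///
    }
}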