Java tutorial: HdfsAnalyzer (Apache HAWQ PXF HDFS plugin)
package org.apache.hawq.pxf.plugins.hdfs;

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

import org.apache.hawq.pxf.api.Analyzer;
import org.apache.hawq.pxf.api.AnalyzerStats;
import org.apache.hawq.pxf.api.ReadAccessor;
import org.apache.hawq.pxf.api.utilities.InputData;
import org.apache.hawq.pxf.service.ReadBridge;
import org.apache.hawq.pxf.plugins.hdfs.utilities.HdfsUtilities;
import org.apache.hawq.pxf.plugins.hdfs.utilities.PxfInputFormat;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;

import java.io.IOException;
import java.util.ArrayList;

/**
 * Analyzer class for HDFS data resources.
 *
 * Given an HDFS data source (a file, directory, or wildcard pattern), returns
 * statistics about it (number of blocks, number of tuples, etc.)
 */
public class HdfsAnalyzer extends Analyzer {
    private JobConf jobConf;
    private FileSystem fs;
    private Log Log;

    /**
     * Constructs an HdfsAnalyzer object.
     *
     * @param inputData all input parameters coming from the client
     * @throws IOException if HDFS file system cannot be retrieved
     */
    public HdfsAnalyzer(InputData inputData) throws IOException {
        super(inputData);
        Log = LogFactory.getLog(HdfsAnalyzer.class);

        jobConf = new JobConf(new Configuration(), HdfsAnalyzer.class);
        fs = FileSystem.get(jobConf);
    }

    /**
     * Collects a number of basic statistics based on an estimate. Statistics
     * are: number of records, number of HDFS blocks and HDFS block size.
     *
     * @param datapath path is a data source URI that can appear as a file name,
     *                 a directory name or a wildcard pattern
     * @return statistics in JSON format
     * @throws Exception if path is wrong, its metadata cannot be retrieved from
     *                   the file system, or if scanning the first block using
     *                   the accessor failed
     */
    @Override
    public AnalyzerStats getEstimatedStats(String datapath) throws Exception {
        long blockSize = 0;
        long numberOfBlocks;
        long dataSize = 0;
        Path path = new Path(HdfsUtilities.absoluteDataPath(datapath));
        ArrayList<InputSplit> splits = getSplits(path);

        for (InputSplit split : splits) {
            FileSplit fsp = (FileSplit) split;
            dataSize += fsp.getLength();
            if (blockSize == 0) {
                Path filePath = fsp.getPath();
                FileStatus fileStatus = fs.getFileStatus(filePath);
                if (fileStatus.isFile()) {
                    blockSize = fileStatus.getBlockSize();
                }
            }
        }

        // if no file is in path (only dirs), get default block size
        if (blockSize == 0) {
            blockSize = fs.getDefaultBlockSize(path);
        }
        numberOfBlocks = splits.size();

        /*
         * The estimate of the number of tuples in the table is based on the
         * actual number of tuples in the first block, multiplied by its size
         * compared to the size of the whole data to be read.
         * The calculation:
         * Ratio of tuples to size = number of tuples in first block / first block size.
         * Total number of tuples = ratio * total data size.
         */
        long numberOfTuplesInBlock = getNumberOfTuplesInBlock(splits);
        long numberOfTuples = 0;
        if (!splits.isEmpty()) {
            long blockLength = splits.get(0).getLength();
            numberOfTuples = (long) Math.floor(((double) numberOfTuplesInBlock / blockLength) * dataSize);
        }

        AnalyzerStats stats = new AnalyzerStats(blockSize, numberOfBlocks,
                numberOfTuples);

        // print files size to log when in debug level
        Log.debug(AnalyzerStats.dataToString(stats, path.toString()));

        return stats;
    }

    /**
     * Calculates the number of tuples in a split (block). Reads one block from
     * HDFS. Exceptions during reading filter upwards and are handled in
     * AnalyzerResource.
     */
    private long getNumberOfTuplesInBlock(ArrayList<InputSplit> splits)
            throws Exception {
        long tuples = -1; /* default - if we are not able to read data */
        ReadAccessor accessor;

        if (splits.isEmpty()) {
            return 0;
        }

        /*
         * metadata information includes: file split's start, length and hosts
         * (locations).
         */
        FileSplit firstSplit = (FileSplit) splits.get(0);
        byte[] fragmentMetadata = HdfsUtilities.prepareFragmentMetadata(firstSplit);
        inputData.setFragmentMetadata(fragmentMetadata);
        inputData.setDataSource(firstSplit.getPath().toUri().getPath());
        accessor = ReadBridge.getFileAccessor(inputData);

        if (accessor.openForRead()) {
            tuples = 0;
            while (accessor.readNextObject() != null) {
                tuples++;
            }
            accessor.closeForRead();
        }

        Log.debug("number of tuples in first block: " + tuples);

        return tuples;
    }

    private ArrayList<InputSplit> getSplits(Path path) throws IOException {
        PxfInputFormat fformat = new PxfInputFormat();
        PxfInputFormat.setInputPaths(jobConf, path);
        InputSplit[] splits = fformat.getSplits(jobConf, 1);
        ArrayList<InputSplit> result = new ArrayList<InputSplit>();

        // remove empty splits
        if (splits != null) {
            for (InputSplit split : splits) {
                if (split.getLength() > 0) {
                    result.add(split);
                }
            }
        }

        return result;
    }
}
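The sketch below shows one way the class above might be driven, purely for illustration. In a real PXF deployment the service layer (AnalyzerResource) builds the InputData object from the incoming REST request, so the buildInputData() helper and the "/data/pxf/example/*.csv" path used here are hypothetical placeholders, not part of the PXF API.

import org.apache.hawq.pxf.api.AnalyzerStats;
import org.apache.hawq.pxf.api.utilities.InputData;
import org.apache.hawq.pxf.plugins.hdfs.HdfsAnalyzer;

public class HdfsAnalyzerExample {
    public static void main(String[] args) throws Exception {
        // Hypothetical helper: in PXF the service layer constructs InputData
        // from the client request; it is not assembled by hand like this.
        InputData inputData = buildInputData();

        // Instantiating the analyzer connects to HDFS using the default
        // Hadoop configuration found on the classpath.
        HdfsAnalyzer analyzer = new HdfsAnalyzer(inputData);

        // Ask for estimated statistics on a file, directory, or wildcard
        // pattern (example path is made up).
        AnalyzerStats stats = analyzer.getEstimatedStats("/data/pxf/example/*.csv");

        // dataToString() is the same rendering used in the class's debug log:
        // block size, number of blocks, and the estimated number of tuples.
        System.out.println(AnalyzerStats.dataToString(stats, "/data/pxf/example/*.csv"));
    }

    private static InputData buildInputData() {
        // Placeholder only; supplied by the PXF service in practice.
        throw new UnsupportedOperationException("InputData comes from the PXF service");
    }
}

Note that the tuple count returned this way is only an estimate: getEstimatedStats() reads every record in the first non-empty split and scales that count by the ratio of the total data size to that split's length, so skewed record sizes across blocks will skew the result.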