Example usage for org.apache.hadoop.io.DataInputBuffer: the DataInputBuffer() constructor

Introduction

This page collects example usages of the org.apache.hadoop.io.DataInputBuffer no-argument constructor, DataInputBuffer().

Prototype

public DataInputBuffer() 

Document

Constructs a new empty buffer.
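
Before the full examples, a minimal self-contained sketch of the usual pattern may help: construct the empty buffer once, then reset() it onto already-serialized bytes (produced here with a DataOutputBuffer) and read values back through the DataInput methods. The class name and sample values below are illustrative only and are not taken from the source files listed under Usage.

import java.io.IOException;

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class DataInputBufferSketch {
    public static void main(String[] args) throws IOException {
        // serialize a couple of values into an in-memory output buffer
        DataOutputBuffer out = new DataOutputBuffer();
        out.writeLong(42L);
        out.writeUTF("hello");

        // construct an empty DataInputBuffer and point it at the serialized bytes;
        // reset() can be called again later to reuse the same instance on new data
        DataInputBuffer in = new DataInputBuffer();
        in.reset(out.getData(), 0, out.getLength());

        long number = in.readLong(); // 42
        String text = in.readUTF();  // "hello"
        System.out.println(number + " " + text + " position=" + in.getPosition());
    }
}

Every example below relies on this same construct-once, reset-per-record pattern.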

Usage

From source file:org.commoncrawl.service.queryserver.index.DatabaseIndexV2.java

License:Open Source License

public static void main(String[] args) {
    if (args.length != 3) {
        LOG.error("args: [candidate Timestamp] [drive count] [query string]");
        return;
    }

    // initialize ...
    final Configuration conf = new Configuration();

    conf.addResource("nutch-default.xml");
    conf.addResource("nutch-site.xml");
    conf.addResource("core-site.xml");
    conf.addResource("hdfs-site.xml");
    conf.addResource("mapred-site.xml");

    BasicConfigurator.configure();
    CrawlEnvironment.setHadoopConfig(conf);

    long candidateTS = Long.parseLong(args[0]);
    int driveCount = Integer.parseInt(args[1]);
    String queryString = args[2];

    try {
        FileSystem fs = CrawlEnvironment.getDefaultFileSystem();

        MasterDatabaseIndex masterIndex = new MasterDatabaseIndex(conf, fs, driveCount, candidateTS, null);
        SlaveDatabaseIndex slaveIndex = new SlaveDatabaseIndex(conf, fs, candidateTS);

        // ok hit the domain against the master index first ...
        LOG.info("Querying master index for DomainId Given DomainName:" + queryString);
        long domainId = masterIndex.queryDomainIdGivenDomain(queryString);

        LOG.info("Querying master index for DomainMetadata Given DomainId:" + domainId);
        SubDomainMetadata subDomainMeta = masterIndex.queryDomainMetadataGivenDomainId(domainId);

        if (subDomainMeta != null) {

            LOG.info("Metadata is present. Deserializing");
            // dump some fields ...
            LOG.info("Domain:" + subDomainMeta.getDomainText() + " URLCount:" + subDomainMeta.getUrlCount()
                    + " FetchedCount:" + subDomainMeta.getFetchedCount() + " PageRankCount:"
                    + subDomainMeta.getHasPageRankCount());

            // ok time to dive into a url list ...

            // query for a list of urls sorted by name
            LOG.info("Querying for URLList for Domain BY PR");
            FlexBuffer urlListBufferByPR = slaveIndex.queryURLListSortedByPR(domainId);

            if (urlListBufferByPR != null) {

                // read the list ...
                DataInputBuffer readerStream = new DataInputBuffer();
                readerStream.reset(urlListBufferByPR.get(), urlListBufferByPR.getCount());
                int totalItemCount = urlListBufferByPR.getCount() / 8;
                System.out.println("List BY  PR totalCount:" + totalItemCount);

                // initialize a fingerprint object to use for queries ...
                URLFPV2 queryFP = new URLFPV2();

                queryFP.setDomainHash(domainId);

                DataInputBuffer metadataReaderStream = new DataInputBuffer();
                // iterate the first N items ranked by page rank
                for (int i = 0; i < Math.min(10, totalItemCount); ++i) {

                    queryFP.setUrlHash(readerStream.readLong());

                    // and for metadata
                    MetadataOut urlMetadata = masterIndex.queryMetadataAndURLGivenFP(queryFP);

                    if (urlMetadata != null) {

                        // decode the url
                        String url = urlMetadata.url.toString();

                        System.out.println("URL for FP:" + queryFP.getUrlHash() + " is:" + url);
                        if (urlMetadata.datumAndMetadataBytes.getLength() == 0) {
                            System.out.println("URL for FP:" + queryFP.getUrlHash() + " had no METADATA!!");
                        } else {

                            // explode metadata
                            CrawlDatumAndMetadata metadataObject = new CrawlDatumAndMetadata();

                            metadataReaderStream.reset(urlMetadata.datumAndMetadataBytes.getBytes(),
                                    urlMetadata.datumAndMetadataBytes.getOffset(),
                                    urlMetadata.datumAndMetadataBytes.getLength());
                            metadataObject.readFields(metadataReaderStream);

                            // ok at this point spit out stuff for this url
                            StringBuilder urlInfo = new StringBuilder();

                            urlInfo.append("    FetchStatus:"
                                    + CrawlDatum.getStatusName(metadataObject.getStatus()) + "\n");
                            urlInfo.append("    PageRank:" + metadataObject.getMetadata().getPageRank() + "\n");
                            urlInfo.append(
                                    "    ContentType:" + metadataObject.getMetadata().getContentType() + "\n");
                            urlInfo.append("    ArcFileInfoCount:"
                                    + metadataObject.getMetadata().getArchiveInfo().size());
                            if (metadataObject.getMetadata()
                                    .isFieldDirty(CrawlURLMetadata.Field_LINKDBFILENO)) {
                                urlInfo.append(
                                        "    HasLinkDataInfo:" + metadataObject.getMetadata().getLinkDBFileNo()
                                                + ":" + metadataObject.getMetadata().getLinkDBOffset());
                            }
                            if (metadataObject.getMetadata()
                                    .isFieldDirty(CrawlURLMetadata.Field_INVERSEDBFILENO)) {
                                urlInfo.append("    HasINVLinkDataInfo:"
                                        + metadataObject.getMetadata().getInverseDBFileNo() + ":"
                                        + metadataObject.getMetadata().getInverseDBOffset());
                            }
                            System.out.println(urlInfo.toString());

                            // now if inverse link data is present ..
                            if (metadataObject.getMetadata()
                                    .isFieldDirty(CrawlURLMetadata.Field_INVERSEDBFILENO)) {
                                // get it ...
                                System.out.println("Querying for Inlinks for FP:" + queryFP.getUrlHash());
                                FlexBuffer inlinks = slaveIndex.queryInlinksByFP(queryFP,
                                        metadataObject.getMetadata().getInverseDBFileNo(),
                                        metadataObject.getMetadata().getInverseDBOffset());

                                if (inlinks != null) {
                                    System.out.println("Found Inlink Buffer of Size:" + inlinks.getCount());
                                    FileSystem localFS = FileSystem.getLocal(conf);
                                    File testDir = new File("/tmp/dbIndexTest");
                                    File testFile = new File("/tmp/dbIndexTestFile");
                                    localFS.delete(new Path(testDir.getAbsolutePath()), true);
                                    localFS.delete(new Path(testFile.getAbsolutePath()), false);
                                    localFS.mkdirs(new Path(testDir.getAbsolutePath()));

                                    LOG.info("Creating Spill File of Inlinks");
                                    spillLinkDataIntoTempFileIndex(fs, localFS, conf, masterIndex, testDir,
                                            new Path(testFile.getAbsolutePath()), inlinks);
                                    LOG.info("Created Spill File of Inlinks");

                                    LOG.info("Reading Inlinks");
                                    // ok now open it up and dump the first few inlinks from the
                                    // spill file
                                    SequenceFile.Reader reader = new SequenceFile.Reader(localFS,
                                            new Path(testFile.getAbsolutePath()), conf);

                                    TextBytes key = new TextBytes();
                                    TriTextBytesTuple value = new TriTextBytesTuple();
                                    CrawlDatumAndMetadata metadata = new CrawlDatumAndMetadata();
                                    DataInputBuffer inputBuffer = new DataInputBuffer();

                                    try {
                                        int itemCount = 0;

                                        while (reader.next(key, value)) {

                                            if (value.getThirdValue().getLength() != 0) {
                                                inputBuffer.reset(value.getThirdValue().getBytes(), 0,
                                                        value.getThirdValue().getLength());
                                                metadata.readFields(inputBuffer);
                                                System.out.println("INLINK:" + key.toString()
                                                        + " METADATA STATUS:"
                                                        + CrawlDatum.getStatusName(metadata.getStatus()));
                                            } else {
                                                System.out.println("INLINK:" + key.toString() + " NOMETADATA");
                                            }

                                            if (++itemCount == 500) {
                                                break;
                                            }
                                        }
                                    } finally {
                                        reader.close();
                                    }

                                    LOG.info("Done Reding Inlinks");
                                }
                            }
                        }
                    } else {
                        LOG.error("Query for FP:" + queryFP.getUrlHash() + " returned NULL URL");
                    }
                }
            }
        }

    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
    }
}

From source file:org.commoncrawl.service.queryserver.query.DomainListQuery.java

License:Open Source License

@Override
protected long executeLocal(FileSystem remoteFileSystem, Configuration conf,
        DatabaseIndexV2.MasterDatabaseIndex index, EventLoop eventLoop, File tempFirDir,
        QueryRequest<DomainListQueryInfo, Text, SubDomainMetadata> requestObject) throws IOException {

    Path mergeResultsPath = new Path(
            getLocalQueryResultsPathPrefix(requestObject) + getMergedResultsFileName());

    LOG.info("Execute Local called for Query:" + getQueryId() + " MergeResultsPath is:" + mergeResultsPath);

    // get a local file system object
    FileSystem localFileSystem = FileSystem.getLocal(conf);

    //LOG.info("Executing LocalQuery - checking if MergedFile:" + mergeResultsPath + " Exists");
    // if source merged results path does not exist ... 
    if (!localFileSystem.exists(mergeResultsPath)) {
        LOG.info("Execute Local for Query:" + getQueryId() + " Source MergeFile:" + mergeResultsPath
                + " Not Found. Checking for parts files");
        // collect parts ...
        Vector<Path> parts = new Vector<Path>();

        FileStatus fileStatusArray[] = remoteFileSystem
                .globStatus(new Path(getHDFSQueryResultsPath(), "part-*"));

        if (fileStatusArray.length == 0) {
            LOG.error("Execute Local for Query:" + getQueryId() + " FAILED. No Parts Files Found!");
            throw new IOException("Remote Component Part Files Not Found");
        }

        for (FileStatus part : fileStatusArray) {
            //LOG.info("Found Part:"+ part);
            parts.add(part.getPath());
        }

        LOG.info("Execute Local for Query:" + getQueryId() + " Initializing Merger");
        SequenceFileSpillWriter<Text, SubDomainMetadata> mergedFileSpillWriter = new SequenceFileSpillWriter<Text, SubDomainMetadata>(
                localFileSystem, conf, mergeResultsPath, Text.class, SubDomainMetadata.class,
                new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(localFileSystem,
                        PositionBasedSequenceFileIndex.getIndexNameFromBaseName(mergeResultsPath)),
                false);

        try {
            SequenceFileMerger<Text, SubDomainMetadata> merger = new SequenceFileMerger<Text, SubDomainMetadata>(
                    remoteFileSystem, conf, parts, mergedFileSpillWriter, Text.class, SubDomainMetadata.class,

                    new RawKeyValueComparator<Text, SubDomainMetadata>() {

                        DataInputBuffer key1Stream = new DataInputBuffer();
                        DataInputBuffer key2Stream = new DataInputBuffer();

                        @Override
                        public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data,
                                int key2Offset, int key2Length, byte[] value1Data, int value1Offset,
                                int value1Length, byte[] value2Data, int value2Offset, int value2Length)
                                throws IOException {

                            key1Stream.reset(key1Data, key1Offset, key1Length);
                            key2Stream.reset(key2Data, key2Offset, key2Length);

                            WritableUtils.readVInt(key1Stream);
                            WritableUtils.readVInt(key2Stream);

                            return BytesWritable.Comparator.compareBytes(key1Data, key1Stream.getPosition(),
                                    key1Length - key1Stream.getPosition(), key2Data, key2Stream.getPosition(),
                                    key2Length - key2Stream.getPosition());
                        }

                        @Override
                        public int compare(Text key1, SubDomainMetadata value1, Text key2,
                                SubDomainMetadata value2) {
                            return key1.compareTo(key2);
                        }

                    });

            try {
                LOG.info("Execute Local for Query:" + getQueryId() + " Running Merger");
                merger.mergeAndSpill(null);
                LOG.info("Execute Local for Query:" + getQueryId()
                        + " Merge Successfull.. Deleting Merge Inputs");
                for (Path inputPath : parts) {
                    remoteFileSystem.delete(inputPath, false);
                }
            } catch (IOException e) {
                LOG.error("Execute Local for Query:" + getQueryId() + " Merge Failed with Exception:"
                        + CCStringUtils.stringifyException(e));
                throw e;
            } finally {
                LOG.info("** CLOSING MERGER");
                merger.close();
            }
        } finally {
            LOG.info("** FLUSHING SPILLWRITER");
            mergedFileSpillWriter.close();
        }
    }

    // now check for query specific merge file ...
    Path queryResultsPath = new Path(getLocalQueryResultsPathPrefix(requestObject)
            + getOutputFileNameBasedOnSortByField(requestObject.getClientQueryInfo().getSortByField()));

    LOG.info("Execute Local for Query:" + getQueryId() + " Checking for QueryResultsPath:" + queryResultsPath);

    if (!localFileSystem.exists(queryResultsPath)) {

        LOG.info("Exectue Local for Query:" + getQueryId() + " Results File:" + queryResultsPath
                + " does not exist. Running sort and merge process");

        LOG.info("Execute Local for Query:" + getQueryId() + " Allocating SpillWriter with output to:"
                + queryResultsPath);
        // allocate a spill writer ...  
        SequenceFileSpillWriter<Text, SubDomainMetadata> sortedResultsFileSpillWriter = new SequenceFileSpillWriter<Text, SubDomainMetadata>(
                localFileSystem, conf, queryResultsPath, Text.class, SubDomainMetadata.class,
                new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(localFileSystem,
                        PositionBasedSequenceFileIndex.getIndexNameFromBaseName(queryResultsPath)),
                false);

        try {

            LOG.info("Execute Local for Query:" + getQueryId() + " Allocating MergeSortSpillWriter");
            // and connect it to the merge spill writer ...
            MergeSortSpillWriter<Text, SubDomainMetadata> mergeSortSpillWriter = new MergeSortSpillWriter<Text, SubDomainMetadata>(
                    conf, sortedResultsFileSpillWriter, localFileSystem, new Path(tempFirDir.getAbsolutePath()),
                    /*
                    new RawKeyValueComparator<Text,SubDomainMetadata>() {
                            
                      SubDomainMetadata value1 = new SubDomainMetadata();
                      SubDomainMetadata value2 = new SubDomainMetadata();
                              
                            
                      @Override
                      public int compare(Text key1, SubDomainMetadata value1, Text key2,SubDomainMetadata value2) {
                        return value1.getUrlCount() - value2.getUrlCount();
                      }
                            
                      @Override
                      public int compareRaw(byte[] key1Data, int key1Offset,
                          int key1Length, byte[] key2Data, int key2Offset,
                          int key2Length, byte[] value1Data, int value1Offset,
                          int value1Length, byte[] value2Data, int value2Offset,
                          int value2Length) throws IOException {
                            
                        value1.clear();
                        value2.clear();
                                
                        value1.readFields(new DataInputStream(new ByteArrayInputStream(value1Data,value1Offset,value1Length)));
                        value2.readFields(new DataInputStream(new ByteArrayInputStream(value2Data,value2Offset,value2Length)));
                                
                        return compare(null, value1, null, value2);
                      } 
                              
                    },
                    */
                    new OptimizedKeyGeneratorAndComparator<Text, SubDomainMetadata>() {

                        @Override
                        public void generateOptimizedKeyForPair(Text key, SubDomainMetadata value,
                                org.commoncrawl.hadoop.mergeutils.OptimizedKeyGeneratorAndComparator.OptimizedKey optimizedKeyOut)
                                throws IOException {
                            optimizedKeyOut.setLongKeyValue(value.getUrlCount());
                        }

                        @Override
                        public int getGeneratedKeyType() {
                            return OptimizedKey.KEY_TYPE_LONG;
                        }
                    }, Text.class, SubDomainMetadata.class, false, null);

            try {

                // create a vector representing the single input segment 
                Vector<Path> singleInputSegment = new Vector<Path>();

                LOG.info("Execute Local for Query:" + getQueryId() + " Adding MergeResultsPath:"
                        + mergeResultsPath + " as input for Merger");
                singleInputSegment.add(mergeResultsPath);

                // create a SequenceFileReader
                SequenceFileReader<Text, SubDomainMetadata> mergeSegmentReader = new SequenceFileReader<Text, SubDomainMetadata>(
                        localFileSystem, conf, singleInputSegment, mergeSortSpillWriter, Text.class,
                        SubDomainMetadata.class);

                try {
                    LOG.info("Execute Local for Query:" + getQueryId() + " calling readAndSpill");
                    mergeSegmentReader.readAndSpill();
                    LOG.info("Execute Local for Query:" + getQueryId() + " readAndSpill finished");
                } finally {
                    if (mergeSegmentReader != null) {
                        mergeSegmentReader.close();
                    }
                }

            } finally {
                if (mergeSortSpillWriter != null) {
                    mergeSortSpillWriter.close();
                }
            }

        } finally {
            if (sortedResultsFileSpillWriter != null) {
                sortedResultsFileSpillWriter.close();
            }
        }
    }

    //LOG.info("Allocating SequenceFileIndex object for DomainListQuery Id:" + getQueryId() + " with Path:" + queryResultsPath);
    PositionBasedSequenceFileIndex<Text, SubDomainMetadata> indexFile = new PositionBasedSequenceFileIndex<Text, SubDomainMetadata>(
            localFileSystem, queryResultsPath, Text.class, SubDomainMetadata.class);
    //LOG.info("SequenceFileIndex object for DomainListQuery Id:" + getQueryId() + " with Path:" + queryResultsPath + " returned record count:" + indexFile.getRecordCount());

    return indexFile.getRecordCount();
}

From source file:org.commoncrawl.service.queryserver.query.InverseLinksByDomainQuery.java

License:Open Source License

static void collectAllTopLevelDomainRecordsByDomain(FileSystem fs, Configuration conf, long databaseId,
        long targetRootDomainFP, FileSystem outputFileSystem, Path finalOutputPath) throws IOException {

    File tempFile = new File("/tmp/inverseLinksReport-" + System.currentTimeMillis());
    tempFile.mkdir();

    try {
        // create the final output spill writer ...  
        SequenceFileSpillWriter<FlexBuffer, URLFPV2> spillwriter = new SequenceFileSpillWriter<FlexBuffer, URLFPV2>(
                outputFileSystem, conf, finalOutputPath, FlexBuffer.class, URLFPV2.class,
                new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(outputFileSystem,
                        PositionBasedSequenceFileIndex.getIndexNameFromBaseName(finalOutputPath)),
                true);

        try {

            MergeSortSpillWriter<FlexBuffer, URLFPV2> finalMerger = new MergeSortSpillWriter<FlexBuffer, URLFPV2>(
                    conf, spillwriter, FileSystem.getLocal(conf), new Path(tempFile.getAbsolutePath()), null,
                    new ComplexKeyComparator(), FlexBuffer.class, URLFPV2.class, true, null);

            try {

                for (int targetShardId = 0; targetShardId < CrawlEnvironment.NUM_DB_SHARDS; ++targetShardId) {
                    // 0. shard domain id to find index file location ... 
                    int indexShardId = (int) ((targetRootDomainFP & Integer.MAX_VALUE)
                            % CrawlEnvironment.NUM_DB_SHARDS);
                    // build path to index file 
                    Path indexFilePath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId
                            + "/phase3Data/part-" + NUMBER_FORMAT.format(indexShardId));
                    LOG.info("rootDomain is:" + targetRootDomainFP + " ShardId:" + indexShardId + " Index Path:"
                            + indexFilePath);
                    // 1. scan domainFP to index file first
                    // 2. given index, scan index->pos file to find scan start position
                    // 3. given scan start position, scan forward until fp match is found.
                    // 4. collect all matching entries and output to a file ? 

                    FSDataInputStream indexDataInputStream = fs.open(indexFilePath);
                    try {
                        TFile.Reader reader = new TFile.Reader(indexDataInputStream,
                                fs.getLength(indexFilePath), conf);
                        try {
                            TFile.Reader.Scanner scanner = reader.createScanner();

                            try {
                                // generate key ... 
                                DataOutputBuffer keyBuffer = new DataOutputBuffer();
                                keyBuffer.writeLong(targetRootDomainFP);
                                if (scanner.seekTo(keyBuffer.getData(), 0, keyBuffer.getLength())) {
                                    // setup for value scan 
                                    DataInputStream valueStream = scanner.entry().getValueStream();
                                    int dataOffsetOut = -1;
                                    while (valueStream.available() > 0) {
                                        // read entries looking for our specific entry
                                        int shardIdx = valueStream.readInt();
                                        int dataOffset = valueStream.readInt();
                                        if (shardIdx == targetShardId) {
                                            dataOffsetOut = dataOffset;
                                            break;
                                        }
                                    }
                                    LOG.info("Index Search Yielded:" + dataOffsetOut);
                                    if (dataOffsetOut != -1) {
                                        // ok create a data path 
                                        Path finalDataPath = new Path("crawl/inverseLinkDB_ByDomain/"
                                                + databaseId + "/phase2Data/data-"
                                                + NUMBER_FORMAT.format(targetShardId));
                                        Path finalDataIndexPath = new Path("crawl/inverseLinkDB_ByDomain/"
                                                + databaseId + "/phase2Data/data-"
                                                + NUMBER_FORMAT.format(targetShardId) + ".index");
                                        // check to see if index is already loaded ... 
                                        PositionBasedSequenceFileIndex<FlexBuffer, TextBytes> index = null;
                                        synchronized (_shardToIndexMap) {
                                            index = _shardToIndexMap.get(targetShardId);
                                        }
                                        if (index == null) {
                                            LOG.info("Loading Index from Path:" + finalDataIndexPath);
                                            // load index
                                            index = new PositionBasedSequenceFileIndex<FlexBuffer, TextBytes>(
                                                    fs, finalDataIndexPath, FlexBuffer.class, TextBytes.class);
                                            // put in cache
                                            synchronized (_shardToIndexMap) {
                                                _shardToIndexMap.put(targetShardId, index);
                                            }
                                        }

                                        LOG.info("Initializing Data Reader at Path:" + finalDataPath);
                                        // ok time to create a reader 
                                        SequenceFile.Reader dataReader = new SequenceFile.Reader(fs,
                                                finalDataPath, conf);

                                        try {
                                            LOG.info("Seeking Reader to Index Position:" + dataOffsetOut);
                                            index.seekReaderToItemAtIndex(dataReader, dataOffsetOut);

                                            FlexBuffer keyBytes = new FlexBuffer();
                                            URLFPV2 sourceFP = new URLFPV2();
                                            DataInputBuffer keyReader = new DataInputBuffer();
                                            TextBytes urlTxt = new TextBytes();

                                            // ok ready to go ...
                                            while (dataReader.next(keyBytes, sourceFP)) {
                                                // initialize reader 
                                                keyReader.reset(keyBytes.get(), keyBytes.getOffset(),
                                                        keyBytes.getCount());

                                                long targetFP = keyReader.readLong();

                                                if (targetRootDomainFP == targetFP) {
                                                    finalMerger.spillRecord(keyBytes, sourceFP);
                                                } else {
                                                    LOG.info("FP:" + targetFP + " > TargetFP:"
                                                            + targetRootDomainFP + " Exiting Iteration Loop");
                                                    break;
                                                }
                                            }
                                        } finally {
                                            LOG.info("Closing Reader");
                                            dataReader.close();
                                        }
                                    }
                                }
                            } finally {
                                LOG.info("Closing Scanner");
                                scanner.close();
                            }

                        } finally {
                            LOG.info("Closing TFile Reader");
                            reader.close();
                        }
                    } finally {
                        LOG.info("Closing InputStream");
                        indexDataInputStream.close();
                    }
                }
            } finally {
                finalMerger.close();
            }
        } finally {
            spillwriter.close();
        }
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        FileUtils.recursivelyDeleteFile(tempFile);
    }

}

From source file:org.commoncrawl.service.queryserver.query.InverseLinksByDomainQuery.java

License:Open Source License

public static void main(String[] args) {
    // initialize ...
    Configuration conf = new Configuration();

    conf.addResource("nutch-default.xml");
    conf.addResource("nutch-site.xml");
    conf.addResource("core-site.xml");
    conf.addResource("hdfs-site.xml");
    conf.addResource("mapred-site.xml");

    LOG.info("URL:" + args[0] + " ShardId:" + args[1]);

    try {
        File tempFile = File.createTempFile("inverseLinksReportTest", "seq");
        try {
            FileSystem fs = FileSystem.get(conf);
            FileSystem localFileSystem = FileSystem.getLocal(conf);

            URLFPV2 fp = URLUtils.getURLFPV2FromURL(args[0]);
            if (fp != null) {
                collectAllTopLevelDomainRecordsByDomain(fs, conf, 1282844121161L, fp.getRootDomainHash(),
                        localFileSystem, new Path(tempFile.getAbsolutePath()));

                SequenceFile.Reader reader = new SequenceFile.Reader(localFileSystem,
                        new Path(tempFile.getAbsolutePath()), conf);
                try {
                    FlexBuffer key = new FlexBuffer();
                    URLFPV2 src = new URLFPV2();
                    TextBytes url = new TextBytes();

                    DataInputBuffer inputBuffer = new DataInputBuffer();

                    while (reader.next(key, src)) {
                        inputBuffer.reset(key.get(), key.getOffset(), key.getCount());
                        long targetFP = inputBuffer.readLong();
                        float pageRank = inputBuffer.readFloat();
                        // ok initialize text bytes ... 
                        int textLen = WritableUtils.readVInt(inputBuffer);
                        url.set(key.get(), inputBuffer.getPosition(), textLen);
                        LOG.info("PR:" + pageRank + " URL:" + url.toString());
                    }
                } finally {
                    reader.close();
                }
            }
        } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            // tempFile.delete();
        }
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
    }
}

From source file:org.commoncrawl.service.queryserver.query.URLLinksQuery.java

License:Open Source License

private static void readPaginatedInlinkingDomainInfo(final DatabaseIndexV2.MasterDatabaseIndex masterIndex,
        FileSystem indexFileSystem, Path indexPath, Path detailPath, int sortOrder, int pageNumber,
        int pageSize, QueryResult<Writable, Writable> resultOut) throws IOException {
    // if descending sort order ... 
    // take pageNumber * pageSize as starting point
    long offset = 0;
    long startPos = 0;
    long endPos = 0;

    FSDataInputStream indexStream = indexFileSystem.open(indexPath);

    try {

        // read in the total record count ... 
        int totalRecordCount = indexStream.readInt();

        LOG.info("***RecordCount:" + totalRecordCount + " Allocating Buffer Of:" + (totalRecordCount * 4)
                + " bytes. FileLength:" + indexFileSystem.getFileStatus(indexPath).getLen());
        // read in index header data upfront 
        byte indexHeaderData[] = new byte[totalRecordCount * 4];
        // read it 
        indexStream.readFully(indexHeaderData);
        // mark string start pos 
        long detailStartPos = indexStream.getPos();
        // initialize index header reader stream 
        DataInputBuffer indexHeaderStream = new DataInputBuffer();
        indexHeaderStream.reset(indexHeaderData, 0, indexHeaderData.length);

        resultOut.getResults().clear();
        resultOut.setPageNumber(pageNumber);
        resultOut.setTotalRecordCount(totalRecordCount);

        if (sortOrder == ClientQueryInfo.SortOrder.ASCENDING) {
            startPos = pageNumber * pageSize;
            endPos = Math.min(startPos + pageSize, totalRecordCount);
            offset = pageNumber * pageSize;
        } else {
            startPos = totalRecordCount - ((pageNumber + 1) * pageSize);
            endPos = startPos + pageSize;
            startPos = Math.max(0, startPos);
            offset = totalRecordCount - ((pageNumber + 1) * pageSize);
        }
        //LOG.info("readPaginatedResults called on Index with sortOrder:" + sortOrder + " pageNumber: " + pageNumber + " pageSize:" + pageSize + " offset is:" + offset);
        if (startPos < totalRecordCount) {

            //LOG.info("Seeking to Offset:" + startPos);
            indexHeaderStream.skip(startPos * 4);
            //LOG.info("Reading from:"+ startPos + " to:" + endPos + " (exclusive)");
            for (long i = startPos; i < endPos; ++i) {

                // read data offset ... 
                int domainDataPos = indexHeaderStream.readInt();
                // seek to it 
                indexStream.seek(detailStartPos + domainDataPos);
                // read the detail data  
                InlinkingDomainInfo domainInfo = new InlinkingDomainInfo();
                domainInfo.readFields(indexStream);
                // ok extract name 
                String domainName = domainInfo.getDomainName();
                if (domainName.length() == 0) {
                    //TODO: NEED TO TRACK THIS DOWN 
                    domainName = "<<OOPS-NULL>>";
                }
                Text key = new Text(domainName);
                domainInfo.setFieldClean(InlinkingDomainInfo.Field_DOMAINNAME);

                if (sortOrder == ClientQueryInfo.SortOrder.DESCENDING) {
                    resultOut.getResults().add(0, new QueryResultRecord<Writable, Writable>(key, domainInfo));
                } else {
                    resultOut.getResults().add(new QueryResultRecord<Writable, Writable>(key, domainInfo));
                }
            }
        }
    } finally {
        indexStream.close();
    }
}

From source file:org.commoncrawl.util.CharsetUtils.java

License:Open Source License

/** last resort - detect encoding using charset detector **/
public static String detectCharacterEncoding(byte[] contentBytes, int offset, int length,
        EncodingDetector detectorType) {

    if (contentBytes != null && length != 0) {

        if (detectorType == EncodingDetector.MOZILLA) {
            DetectorState state = new DetectorState();

            nsDetector detector = new nsDetector(nsPSMDetector.ALL);

            if (offset != 0) {
                int tempBufferLen = Math.min(length, MAX_CHARS_TO_DETECT);
                byte[] tempBuffer = new byte[tempBufferLen];
                System.arraycopy(contentBytes, offset, tempBuffer, 0, tempBufferLen);
                contentBytes = tempBuffer;
                offset = 0;
                length = tempBufferLen;
            }

            detector.Init(state);

            boolean isAscii = detector.isAscii(contentBytes, length);

            if (!isAscii) {
                isAscii = detector.DoIt(contentBytes, Math.min(length, MAX_CHARS_TO_DETECT), false);
            }
            detector.DataEnd();

            if (isAscii) {
                return null;
            } else if (state._detectedCharset != null) {
                return state._detectedCharset;
            } else {
                String prob[] = detector.getProbableCharsets();
                if (prob != null && prob.length != 0) {
                    return prob[0];
                }
            }
        } else {
            // instantiate icu charset detector ... 
            CharsetDetector detector = new CharsetDetector();
            DataInputBuffer buffer = new DataInputBuffer();
            buffer.reset(contentBytes, offset, length);
            try {
                detector.setText(buffer);
                CharsetMatch matches[] = detector.detectAll();
                if (matches != null && matches.length != 0) {
                    int kThresold = 10;
                    CharsetMatch bestMatch = null;
                    for (int i = 0; i < matches.length; ++i) {
                        if (bestMatch == null || matches[i].getConfidence() > bestMatch.getConfidence()) {
                            bestMatch = matches[i];
                        }
                    }
                    if (bestMatch != null) {
                        return bestMatch.getName();
                    } else {
                        return matches[0].getName();
                    }
                }
            } catch (Exception e) {
                LOG.error(CCStringUtils.stringifyException(e));
            } finally {
            }
        }
    }
    return null;
}

From source file:org.commoncrawl.util.CompressedURLFPListV2.java

License:Open Source License

public static void main(String[] args) {

    // initialize ...
    final Configuration conf = new Configuration();

    conf.addResource("nutch-default.xml");
    conf.addResource("nutch-site.xml");
    conf.addResource("core-site.xml");
    conf.addResource("hdfs-site.xml");
    conf.addResource("mapred-site.xml");

    BasicConfigurator.configure();
    CrawlEnvironment.setHadoopConfig(conf);

    try {
        FileSystem fs = CrawlEnvironment.getDefaultFileSystem();

        Path testFile = new Path("crawl/linkdb/merged1282844121161/linkData/part-00000");
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, testFile, conf);

        URLFPV2 fp = new URLFPV2();
        BytesWritable bytes = new BytesWritable();

        while (reader.next(fp, bytes)) {
            if (bytes.getLength() != 0) {
                DataInputBuffer inputStream = new DataInputBuffer();
                inputStream.reset(bytes.get(), bytes.getLength());
                CompressedURLFPListV2.Reader listReader = new CompressedURLFPListV2.Reader(inputStream);
                while (listReader.hasNext()) {
                    URLFPV2 nextFP = listReader.next();
                    LOG.info("DH:" + nextFP.getDomainHash() + " UH:" + nextFP.getUrlHash());
                }
            } else {
                LOG.error("ZERO BYTE LIST!");
            }
        }

        reader.close();
    } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
    }

    if (1 == 1)
        return;

    validateDuplicateChecking();
    // validateReallyBigList();
    validateURLFPSerializationRootDomain();
    validateURLFPSerializationSingleSubDomain();
    validateURLFPSerializationMultiDomain();
    validateURLFPFlagSerializationRootDomain();
    validateURLFPFlagSerializationMultipleSubDomains();
    validateURLFPFlagSerializationOneSubDomain();
}

From source file:org.commoncrawl.util.MultiFileMergeUtils.java

License:Open Source License

static void scanToItemThenDisplayNext(FileSystem fs, Path path, Configuration conf, URLFPV2 targetItem)
        throws IOException {
    DataOutputBuffer rawKey = new DataOutputBuffer();
    DataInputBuffer keyDataStream = new DataInputBuffer();

    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
    ValueBytes valueBytes = reader.createValueBytes();

    int i = 0;
    while (reader.nextRawKey(rawKey) != -1) {
        URLFPV2 keyObject = new URLFPV2();
        keyDataStream.reset(rawKey.getData(), 0, rawKey.getLength());
        keyObject.readFields(keyDataStream);
        rawKey.reset();
        reader.nextRawValue(valueBytes);

        if (keyObject.compareTo(targetItem) == 0) {

            reader.nextRawKey(rawKey);
            URLFPV2 nextKeyObject = new URLFPV2();
            keyDataStream.reset(rawKey.getData(), 0, rawKey.getLength());
            nextKeyObject.readFields(keyDataStream);
            LOG.info("Target Domain:" + targetItem.getDomainHash() + " FP:" + targetItem.getUrlHash()
                    + " NextDomain:" + nextKeyObject.getDomainHash() + " NextHash:"
                    + nextKeyObject.getUrlHash());
            break;
        }
    }
    reader.close();
}

From source file:org.commoncrawl.util.MultiFileMergeUtils.java

License:Open Source License

static void addFirstNFPItemsToSet(FileSystem fs, Path path, Configuration conf, Set<URLFPV2> outputSet,
        int nItems) throws IOException {
    DataOutputBuffer rawKey = new DataOutputBuffer();
    DataInputBuffer keyDataStream = new DataInputBuffer();

    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
    ValueBytes valueBytes = reader.createValueBytes();

    int i = 0;
    while (reader.nextRawKey(rawKey) != -1) {
        URLFPV2 keyObject = new URLFPV2();
        keyDataStream.reset(rawKey.getData(), 0, rawKey.getLength());
        keyObject.readFields(keyDataStream);
        outputSet.add(keyObject);
        rawKey.reset();
        reader.nextRawValue(valueBytes);

        if (++i == nItems) {
            break;
        }
    }
    reader.close();
}

From source file:org.commoncrawl.util.TextBytes.java

License:Open Source License

public static void main(String[] args) {
    // run some tests on the new code
    String aTestString = new String("A Test Strnig");
    // convert it to bytes
    byte bytes[] = aTestString.getBytes();
    // over allocate an array
    byte overAllocated[] = new byte[bytes.length * 2];
    // copy source
    System.arraycopy(bytes, 0, overAllocated, bytes.length, bytes.length);
    // now allocate a TextBytes
    TextBytes textBytes = new TextBytes();
    // set the overallocated buffer as the backing store
    textBytes.set(overAllocated, bytes.length, bytes.length);
    // convert it to string first
    String toString = textBytes.toString();
    // validate equal to original
    Assert.assertTrue(aTestString.equals(toString));
    // ok now write it to output buffer
    DataOutputBuffer outputBuffer = new DataOutputBuffer();
    // write string
    try {
        textBytes.write(outputBuffer);
        // read length
        DataInputBuffer inputBuffer = new DataInputBuffer();
        inputBuffer.reset(outputBuffer.getData(), 0, outputBuffer.size());
        int encodedLength = WritableUtils.readVInt(inputBuffer);
        // validate arrays match ...
        Assert.assertTrue(encodedLength == bytes.length);
        Assert.assertEquals(WritableComparator.compareBytes(bytes, 0, bytes.length, outputBuffer.getData(),
                inputBuffer.getPosition(), outputBuffer.getLength() - inputBuffer.getPosition()), 0);
        // ok reset input buffer again ...
        inputBuffer.reset(outputBuffer.getData(), 0, outputBuffer.size());
        // read in fields
        textBytes.readFields(inputBuffer);
        // ok see if we are not using the original backing store ...
        Assert.assertTrue(textBytes.getBytes() != overAllocated);
        // validate buffers match to original
        Assert.assertEquals(WritableComparator.compareBytes(bytes, 0, bytes.length, textBytes.getBytes(),
                textBytes.getOffset(), textBytes.getLength()), 0);

    } catch (IOException e) {
        e.printStackTrace();
    }

}