List of usage examples for the org.apache.hadoop.io.DataInputBuffer constructor
public DataInputBuffer()
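Before the full examples, here is a minimal sketch (not taken from any of the source files below) of the basic pattern they all share: serialize with a DataOutputBuffer, then point a DataInputBuffer at the same byte[] with reset() and read the fields back without copying.

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;

public class DataInputBufferBasics {
  public static void main(String[] args) throws IOException {
    DataOutputBuffer out = new DataOutputBuffer();
    out.writeLong(42L);
    new Text("hello").write(out);

    DataInputBuffer in = new DataInputBuffer();
    // reset() points the buffer at an existing byte[]; no data is copied
    in.reset(out.getData(), 0, out.getLength());

    long id = in.readLong();
    Text word = new Text();
    word.readFields(in);
    System.out.println(id + " " + word + " pos=" + in.getPosition());
  }
}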
From source file:org.commoncrawl.service.queryserver.index.DatabaseIndexV2.java
License:Open Source License
public static void main(String[] args) {
  if (args.length != 3) {
    LOG.error("args: [candidate Timestamp] [drive count] [query string]");
  }
  // initialize ...
  final Configuration conf = new Configuration();
  conf.addResource("nutch-default.xml");
  conf.addResource("nutch-site.xml");
  conf.addResource("core-site.xml");
  conf.addResource("hdfs-site.xml");
  conf.addResource("mapred-site.xml");
  BasicConfigurator.configure();
  CrawlEnvironment.setHadoopConfig(conf);
  long candidateTS = Long.parseLong(args[0]);
  int driveCount = Integer.parseInt(args[1]);
  String queryString = args[2];
  try {
    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
    MasterDatabaseIndex masterIndex = new MasterDatabaseIndex(conf, fs, driveCount, candidateTS, null);
    SlaveDatabaseIndex slaveIndex = new SlaveDatabaseIndex(conf, fs, candidateTS);
    // ok hit the domain against the master index first ...
    LOG.info("Querying master index for DomainId Given DomainName:" + queryString);
    long domainId = masterIndex.queryDomainIdGivenDomain(queryString);
    LOG.info("Querying master index for DomainMetadata Given DomainId:" + domainId);
    SubDomainMetadata subDomainMeta = masterIndex.queryDomainMetadataGivenDomainId(domainId);
    if (subDomainMeta != null) {
      LOG.info("Metadata is present. Deserializing");
      // dump some fields ...
      LOG.info("Domain:" + subDomainMeta.getDomainText() + " URLCount:" + subDomainMeta.getUrlCount()
          + " FetchedCount:" + subDomainMeta.getFetchedCount() + " PageRankCount:"
          + subDomainMeta.getHasPageRankCount());
      // ok time to dive into a url list ...
      // query for a list of urls sorted by name
      LOG.info("Querying for URLList for Domain BY PR");
      FlexBuffer urlListBufferByPR = slaveIndex.queryURLListSortedByPR(domainId);
      if (urlListBufferByPR != null) {
        // read the list ...
        DataInputBuffer readerStream = new DataInputBuffer();
        readerStream.reset(urlListBufferByPR.get(), urlListBufferByPR.getCount());
        int totalItemCount = urlListBufferByPR.getCount() / 8;
        System.out.println("List BY PR totalCount:" + totalItemCount);
        // initialize a fingerprint object to use for queries ...
        URLFPV2 queryFP = new URLFPV2();
        queryFP.setDomainHash(domainId);
        DataInputBuffer metadataReaderStream = new DataInputBuffer();
        // iterate the first N items ranked by page rank
        for (int i = 0; i < Math.min(10, totalItemCount); ++i) {
          queryFP.setUrlHash(readerStream.readLong());
          // and for metadata
          MetadataOut urlMetadata = masterIndex.queryMetadataAndURLGivenFP(queryFP);
          if (urlMetadata != null) {
            // decode the url
            String url = urlMetadata.url.toString();
            System.out.println("URL for FP:" + queryFP.getUrlHash() + " is:" + url);
            if (urlMetadata.datumAndMetadataBytes.getLength() == 0) {
              System.out.println("URL for FP:" + queryFP.getUrlHash() + " had no METADATA!!");
            } else {
              // explode metadata
              CrawlDatumAndMetadata metadataObject = new CrawlDatumAndMetadata();
              metadataReaderStream.reset(urlMetadata.datumAndMetadataBytes.getBytes(),
                  urlMetadata.datumAndMetadataBytes.getOffset(),
                  urlMetadata.datumAndMetadataBytes.getLength());
              metadataObject.readFields(metadataReaderStream);
              // ok at this point spit out stuff for this url
              StringBuilder urlInfo = new StringBuilder();
              urlInfo.append(" FetchStatus:" + CrawlDatum.getStatusName(metadataObject.getStatus()) + "\n");
              urlInfo.append(" PageRank:" + metadataObject.getMetadata().getPageRank() + "\n");
              urlInfo.append(" ContentType:" + metadataObject.getMetadata().getContentType() + "\n");
              urlInfo.append(" ArcFileInfoCount:" + metadataObject.getMetadata().getArchiveInfo().size());
              if (metadataObject.getMetadata().isFieldDirty(CrawlURLMetadata.Field_LINKDBFILENO)) {
                urlInfo.append(" HasLinkDataInfo:" + metadataObject.getMetadata().getLinkDBFileNo() + ":"
                    + metadataObject.getMetadata().getLinkDBOffset());
              }
              if (metadataObject.getMetadata().isFieldDirty(CrawlURLMetadata.Field_INVERSEDBFILENO)) {
                urlInfo.append(" HasINVLinkDataInfo:" + metadataObject.getMetadata().getInverseDBFileNo() + ":"
                    + metadataObject.getMetadata().getInverseDBOffset());
              }
              System.out.println(urlInfo.toString());
              // now if inverse link data is present ..
              if (metadataObject.getMetadata().isFieldDirty(CrawlURLMetadata.Field_INVERSEDBFILENO)) {
                // get it ...
                System.out.println("Querying for Inlinks for FP:" + queryFP.getUrlHash());
                FlexBuffer inlinks = slaveIndex.queryInlinksByFP(queryFP,
                    metadataObject.getMetadata().getInverseDBFileNo(),
                    metadataObject.getMetadata().getInverseDBOffset());
                if (inlinks != null) {
                  System.out.println("Found Inlink Buffer of Size:" + inlinks.getCount());
                  FileSystem localFS = FileSystem.getLocal(conf);
                  File testDir = new File("/tmp/dbIndexTest");
                  File testFile = new File("/tmp/dbIndexTestFile");
                  localFS.delete(new Path(testDir.getAbsolutePath()), true);
                  localFS.delete(new Path(testFile.getAbsolutePath()), false);
                  localFS.mkdirs(new Path(testDir.getAbsolutePath()));
                  LOG.info("Creating Spill File of Inlinks");
                  spillLinkDataIntoTempFileIndex(fs, localFS, conf, masterIndex, testDir,
                      new Path(testFile.getAbsolutePath()), inlinks);
                  LOG.info("Created Spill File of Inlinks");
                  LOG.info("Reading Inlinks");
                  // ok now open it up and dump the first few inlinks from the spill file
                  SequenceFile.Reader reader = new SequenceFile.Reader(localFS,
                      new Path(testFile.getAbsolutePath()), conf);
                  TextBytes key = new TextBytes();
                  TriTextBytesTuple value = new TriTextBytesTuple();
                  CrawlDatumAndMetadata metadata = new CrawlDatumAndMetadata();
                  DataInputBuffer inputBuffer = new DataInputBuffer();
                  try {
                    int itemCount = 0;
                    while (reader.next(key, value)) {
                      if (value.getThirdValue().getLength() != 0) {
                        inputBuffer.reset(value.getThirdValue().getBytes(), 0, value.getThirdValue().getLength());
                        metadata.readFields(inputBuffer);
                        System.out.println("INLINK:" + key.toString() + " METADATA STATUS:"
                            + CrawlDatum.getStatusName(metadata.getStatus()));
                      } else {
                        System.out.println("INLINK:" + key.toString() + " NOMETADATA");
                      }
                      if (++itemCount == 500) {
                        break;
                      }
                    }
                  } finally {
                    reader.close();
                  }
                  LOG.info("Done Reading Inlinks");
                }
              }
            }
          } else {
            LOG.error("Query for FP:" + queryFP.getUrlHash() + " returned NULL URL");
          }
        }
      }
    }
  } catch (IOException e) {
    LOG.error(CCStringUtils.stringifyException(e));
  }
}
From source file:org.commoncrawl.service.queryserver.query.DomainListQuery.java
License:Open Source License
@Override
protected long executeLocal(FileSystem remoteFileSystem, Configuration conf,
    DatabaseIndexV2.MasterDatabaseIndex index, EventLoop eventLoop, File tempFirDir,
    QueryRequest<DomainListQueryInfo, Text, SubDomainMetadata> requestObject) throws IOException {
  Path mergeResultsPath = new Path(getLocalQueryResultsPathPrefix(requestObject) + getMergedResultsFileName());
  LOG.info("Execute Local called for Query:" + getQueryId() + " MergeResultsPath is:" + mergeResultsPath);
  // get a local file system object
  FileSystem localFileSystem = FileSystem.getLocal(conf);
  //LOG.info("Executing LocalQuery - checking if MergedFile:" + mergeResultsPath + " Exists");
  // if source merged results path does not exist ...
  if (!localFileSystem.exists(mergeResultsPath)) {
    LOG.info("Execute Local for Query:" + getQueryId() + " Source MergeFile:" + mergeResultsPath
        + " Not Found. Checking for parts files");
    // collect parts ...
    Vector<Path> parts = new Vector<Path>();
    FileStatus fileStatusArray[] = remoteFileSystem.globStatus(new Path(getHDFSQueryResultsPath(), "part-*"));
    if (fileStatusArray.length == 0) {
      LOG.error("Execute Local for Query:" + getQueryId() + " FAILED. No Parts Files Found!");
      throw new IOException("Remote Component Part Files Not Found");
    }
    for (FileStatus part : fileStatusArray) {
      //LOG.info("Found Part:"+ part);
      parts.add(part.getPath());
    }
    LOG.info("Execute Local for Query:" + getQueryId() + " Initializing Merger");
    SequenceFileSpillWriter<Text, SubDomainMetadata> mergedFileSpillWriter = new SequenceFileSpillWriter<Text, SubDomainMetadata>(
        localFileSystem, conf, mergeResultsPath, Text.class, SubDomainMetadata.class,
        new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(localFileSystem,
            PositionBasedSequenceFileIndex.getIndexNameFromBaseName(mergeResultsPath)),
        false);
    try {
      SequenceFileMerger<Text, SubDomainMetadata> merger = new SequenceFileMerger<Text, SubDomainMetadata>(
          remoteFileSystem, conf, parts, mergedFileSpillWriter, Text.class, SubDomainMetadata.class,
          new RawKeyValueComparator<Text, SubDomainMetadata>() {

            DataInputBuffer key1Stream = new DataInputBuffer();
            DataInputBuffer key2Stream = new DataInputBuffer();

            @Override
            public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data,
                int key2Offset, int key2Length, byte[] value1Data, int value1Offset, int value1Length,
                byte[] value2Data, int value2Offset, int value2Length) throws IOException {
              key1Stream.reset(key1Data, key1Offset, key1Length);
              key2Stream.reset(key2Data, key2Offset, key2Length);
              WritableUtils.readVInt(key1Stream);
              WritableUtils.readVInt(key2Stream);
              return BytesWritable.Comparator.compareBytes(key1Data, key1Stream.getPosition(),
                  key1Length - key1Stream.getPosition(), key2Data, key2Stream.getPosition(),
                  key2Length - key2Stream.getPosition());
            }

            @Override
            public int compare(Text key1, SubDomainMetadata value1, Text key2, SubDomainMetadata value2) {
              return key1.compareTo(key2);
            }
          });
      try {
        LOG.info("Execute Local for Query:" + getQueryId() + " Running Merger");
        merger.mergeAndSpill(null);
        LOG.info("Execute Local for Query:" + getQueryId() + " Merge Successful. Deleting Merge Inputs");
        for (Path inputPath : parts) {
          remoteFileSystem.delete(inputPath, false);
        }
      } catch (IOException e) {
        LOG.error("Execute Local for Query:" + getQueryId() + " Merge Failed with Exception:"
            + CCStringUtils.stringifyException(e));
        throw e;
      } finally {
        LOG.info("** CLOSING MERGER");
        merger.close();
      }
    } finally {
      LOG.info("** FLUSHING SPILLWRITER");
      mergedFileSpillWriter.close();
    }
  }
  // now check for query specific merge file ...
  Path queryResultsPath = new Path(getLocalQueryResultsPathPrefix(requestObject)
      + getOutputFileNameBasedOnSortByField(requestObject.getClientQueryInfo().getSortByField()));
  LOG.info("Execute Local for Query:" + getQueryId() + " Checking for QueryResultsPath:" + queryResultsPath);
  if (!localFileSystem.exists(queryResultsPath)) {
    LOG.info("Execute Local for Query:" + getQueryId() + " Results File:" + queryResultsPath
        + " does not exist. Running sort and merge process");
    LOG.info("Execute Local for Query:" + getQueryId() + " Allocating SpillWriter with output to:"
        + queryResultsPath);
    // allocate a spill writer ...
    SequenceFileSpillWriter<Text, SubDomainMetadata> sortedResultsFileSpillWriter = new SequenceFileSpillWriter<Text, SubDomainMetadata>(
        localFileSystem, conf, queryResultsPath, Text.class, SubDomainMetadata.class,
        new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(localFileSystem,
            PositionBasedSequenceFileIndex.getIndexNameFromBaseName(queryResultsPath)),
        false);
    try {
      LOG.info("Execute Local for Query:" + getQueryId() + " Allocating MergeSortSpillWriter");
      // and connect it to the merge spill writer ...
      MergeSortSpillWriter<Text, SubDomainMetadata> mergeSortSpillWriter = new MergeSortSpillWriter<Text, SubDomainMetadata>(
          conf, sortedResultsFileSpillWriter, localFileSystem, new Path(tempFirDir.getAbsolutePath()),
          /*
          new RawKeyValueComparator<Text, SubDomainMetadata>() {

            SubDomainMetadata value1 = new SubDomainMetadata();
            SubDomainMetadata value2 = new SubDomainMetadata();

            @Override
            public int compare(Text key1, SubDomainMetadata value1, Text key2, SubDomainMetadata value2) {
              return value1.getUrlCount() - value2.getUrlCount();
            }

            @Override
            public int compareRaw(byte[] key1Data, int key1Offset, int key1Length, byte[] key2Data,
                int key2Offset, int key2Length, byte[] value1Data, int value1Offset, int value1Length,
                byte[] value2Data, int value2Offset, int value2Length) throws IOException {
              value1.clear();
              value2.clear();
              value1.readFields(new DataInputStream(new ByteArrayInputStream(value1Data, value1Offset, value1Length)));
              value2.readFields(new DataInputStream(new ByteArrayInputStream(value2Data, value2Offset, value2Length)));
              return compare(null, value1, null, value2);
            }
          },
          */
          new OptimizedKeyGeneratorAndComparator<Text, SubDomainMetadata>() {

            @Override
            public void generateOptimizedKeyForPair(Text key, SubDomainMetadata value,
                org.commoncrawl.hadoop.mergeutils.OptimizedKeyGeneratorAndComparator.OptimizedKey optimizedKeyOut)
                throws IOException {
              optimizedKeyOut.setLongKeyValue(value.getUrlCount());
            }

            @Override
            public int getGeneratedKeyType() {
              return OptimizedKey.KEY_TYPE_LONG;
            }
          }, Text.class, SubDomainMetadata.class, false, null);
      try {
        // create a vector representing the single input segment
        Vector<Path> singleInputSegment = new Vector<Path>();
        LOG.info("Execute Local for Query:" + getQueryId() + " Adding MergeResultsPath:" + mergeResultsPath
            + " as input for Merger");
        singleInputSegment.add(mergeResultsPath);
        // create a SequenceFileReader
        SequenceFileReader<Text, SubDomainMetadata> mergeSegmentReader = new SequenceFileReader<Text, SubDomainMetadata>(
            localFileSystem, conf, singleInputSegment, mergeSortSpillWriter, Text.class, SubDomainMetadata.class);
        try {
          LOG.info("Execute Local for Query:" + getQueryId() + " calling readAndSpill");
          mergeSegmentReader.readAndSpill();
          LOG.info("Execute Local for Query:" + getQueryId() + " readAndSpill finished");
        } finally {
          if (mergeSegmentReader != null) {
            mergeSegmentReader.close();
          }
        }
      } finally {
        if (mergeSortSpillWriter != null) {
          mergeSortSpillWriter.close();
        }
      }
    } finally {
      if (sortedResultsFileSpillWriter != null) {
        sortedResultsFileSpillWriter.close();
      }
    }
  }
  //LOG.info("Allocating SequenceFileIndex object for DomainListQuery Id:" + getQueryId() + " with Path:" + queryResultsPath);
  PositionBasedSequenceFileIndex<Text, SubDomainMetadata> indexFile = new PositionBasedSequenceFileIndex<Text, SubDomainMetadata>(
      localFileSystem, queryResultsPath, Text.class, SubDomainMetadata.class);
  //LOG.info("SequenceFileIndex object for DomainListQuery Id:" + getQueryId() + " with Path:" + queryResultsPath + " returned record count:" + indexFile.getRecordCount());
  return indexFile.getRecordCount();
}
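The DataInputBuffer-specific piece of the example above is the raw comparator: two reusable DataInputBuffers are reset over the serialized Text keys, WritableUtils.readVInt consumes the length prefix, and the remaining bytes are compared in place. A simplified, standalone sketch of that trick (illustrative only; the merge classes and SubDomainMetadata are CommonCrawl-specific, and offsets are assumed to be 0 here):

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.io.WritableUtils;

public class RawTextKeyCompare {
  private static final DataInputBuffer key1Stream = new DataInputBuffer();
  private static final DataInputBuffer key2Stream = new DataInputBuffer();

  static int compareRawTextKeys(byte[] k1, int len1, byte[] k2, int len2) throws IOException {
    key1Stream.reset(k1, 0, len1);
    key2Stream.reset(k2, 0, len2);
    WritableUtils.readVInt(key1Stream); // skip the length prefix written by Text.write()
    WritableUtils.readVInt(key2Stream);
    return WritableComparator.compareBytes(k1, key1Stream.getPosition(), len1 - key1Stream.getPosition(),
        k2, key2Stream.getPosition(), len2 - key2Stream.getPosition());
  }

  public static void main(String[] args) throws IOException {
    DataOutputBuffer a = new DataOutputBuffer();
    DataOutputBuffer b = new DataOutputBuffer();
    new Text("apple.com").write(a);
    new Text("banana.com").write(b);
    System.out.println(compareRawTextKeys(a.getData(), a.getLength(), b.getData(), b.getLength()));
  }
}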
From source file:org.commoncrawl.service.queryserver.query.InverseLinksByDomainQuery.java
License:Open Source License
static void collectAllTopLevelDomainRecordsByDomain(FileSystem fs, Configuration conf, long databaseId,
    long targetRootDomainFP, FileSystem outputFileSystem, Path finalOutputPath) throws IOException {
  File tempFile = new File("/tmp/inverseLinksReport-" + System.currentTimeMillis());
  tempFile.mkdir();
  try {
    // create the final output spill writer ...
    SequenceFileSpillWriter<FlexBuffer, URLFPV2> spillwriter = new SequenceFileSpillWriter<FlexBuffer, URLFPV2>(
        outputFileSystem, conf, finalOutputPath, FlexBuffer.class, URLFPV2.class,
        new PositionBasedSequenceFileIndex.PositionBasedIndexWriter(outputFileSystem,
            PositionBasedSequenceFileIndex.getIndexNameFromBaseName(finalOutputPath)),
        true);
    try {
      MergeSortSpillWriter<FlexBuffer, URLFPV2> finalMerger = new MergeSortSpillWriter<FlexBuffer, URLFPV2>(
          conf, spillwriter, FileSystem.getLocal(conf), new Path(tempFile.getAbsolutePath()), null,
          new ComplexKeyComparator(), FlexBuffer.class, URLFPV2.class, true, null);
      try {
        for (int targetShardId = 0; targetShardId < CrawlEnvironment.NUM_DB_SHARDS; ++targetShardId) {
          // 0. shard domain id to find index file location ...
          int indexShardId = (int) ((targetRootDomainFP & Integer.MAX_VALUE) % CrawlEnvironment.NUM_DB_SHARDS);
          // build path to index file
          Path indexFilePath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId + "/phase3Data/part-"
              + NUMBER_FORMAT.format(indexShardId));
          LOG.info("rootDomain is:" + targetRootDomainFP + " ShardId:" + indexShardId + " Index Path:"
              + indexFilePath);
          // 1. scan domainFP to index file first
          // 2. given index, scan index->pos file to find scan start position
          // 3. given scan start position, scan forward until fp match is found.
          // 4. collect all matching entries and output to a file ?
          FSDataInputStream indexDataInputStream = fs.open(indexFilePath);
          try {
            TFile.Reader reader = new TFile.Reader(indexDataInputStream, fs.getLength(indexFilePath), conf);
            try {
              TFile.Reader.Scanner scanner = reader.createScanner();
              try {
                // generate key ...
                DataOutputBuffer keyBuffer = new DataOutputBuffer();
                keyBuffer.writeLong(targetRootDomainFP);
                if (scanner.seekTo(keyBuffer.getData(), 0, keyBuffer.getLength())) {
                  // setup for value scan
                  DataInputStream valueStream = scanner.entry().getValueStream();
                  int dataOffsetOut = -1;
                  while (valueStream.available() > 0) {
                    // read entries looking for our specific entry
                    int shardIdx = valueStream.readInt();
                    int dataOffset = valueStream.readInt();
                    if (shardIdx == targetShardId) {
                      dataOffsetOut = dataOffset;
                      break;
                    }
                  }
                  LOG.info("Index Search Yielded:" + dataOffsetOut);
                  if (dataOffsetOut != -1) {
                    // ok create a data path
                    Path finalDataPath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId
                        + "/phase2Data/data-" + NUMBER_FORMAT.format(targetShardId));
                    Path finalDataIndexPath = new Path("crawl/inverseLinkDB_ByDomain/" + databaseId
                        + "/phase2Data/data-" + NUMBER_FORMAT.format(targetShardId) + ".index");
                    // check to see if index is already loaded ...
                    PositionBasedSequenceFileIndex<FlexBuffer, TextBytes> index = null;
                    synchronized (_shardToIndexMap) {
                      index = _shardToIndexMap.get(targetShardId);
                    }
                    if (index == null) {
                      LOG.info("Loading Index from Path:" + finalDataIndexPath);
                      // load index
                      index = new PositionBasedSequenceFileIndex<FlexBuffer, TextBytes>(fs, finalDataIndexPath,
                          FlexBuffer.class, TextBytes.class);
                      // put in cache
                      synchronized (_shardToIndexMap) {
                        _shardToIndexMap.put(targetShardId, index);
                      }
                    }
                    LOG.info("Initializing Data Reader at Path:" + finalDataPath);
                    // ok time to create a reader
                    SequenceFile.Reader dataReader = new SequenceFile.Reader(fs, finalDataPath, conf);
                    try {
                      LOG.info("Seeking Reader to Index Position:" + dataOffsetOut);
                      index.seekReaderToItemAtIndex(dataReader, dataOffsetOut);
                      FlexBuffer keyBytes = new FlexBuffer();
                      URLFPV2 sourceFP = new URLFPV2();
                      DataInputBuffer keyReader = new DataInputBuffer();
                      TextBytes urlTxt = new TextBytes();
                      // ok ready to go ...
                      while (dataReader.next(keyBytes, sourceFP)) {
                        // initialize reader
                        keyReader.reset(keyBytes.get(), keyBytes.getOffset(), keyBytes.getCount());
                        long targetFP = keyReader.readLong();
                        if (targetRootDomainFP == targetFP) {
                          finalMerger.spillRecord(keyBytes, sourceFP);
                        } else {
                          LOG.info("FP:" + targetFP + " > TargetFP:" + targetRootDomainFP
                              + " Exiting Iteration Loop");
                          break;
                        }
                      }
                    } finally {
                      LOG.info("Closing Reader");
                      dataReader.close();
                    }
                  }
                }
              } finally {
                LOG.info("Closing Scanner");
                scanner.close();
              }
            } finally {
              LOG.info("Closing TFile Reader");
              reader.close();
            }
          } finally {
            LOG.info("Closing InputStream");
            indexDataInputStream.close();
          }
        }
      } finally {
        finalMerger.close();
      }
    } finally {
      spillwriter.close();
    }
  } catch (IOException e) {
    LOG.error(CCStringUtils.stringifyException(e));
    FileUtils.recursivelyDeleteFile(tempFile);
  }
}
From source file:org.commoncrawl.service.queryserver.query.InverseLinksByDomainQuery.java
License:Open Source License
public static void main(String[] args) {
  // initialize ...
  Configuration conf = new Configuration();
  conf.addResource("nutch-default.xml");
  conf.addResource("nutch-site.xml");
  conf.addResource("core-site.xml");
  conf.addResource("hdfs-site.xml");
  conf.addResource("mapred-site.xml");
  LOG.info("URL:" + args[0] + " ShardId:" + args[1]);
  try {
    File tempFile = File.createTempFile("inverseLinksReportTest", "seq");
    try {
      FileSystem fs = FileSystem.get(conf);
      FileSystem localFileSystem = FileSystem.getLocal(conf);
      URLFPV2 fp = URLUtils.getURLFPV2FromURL(args[0]);
      if (fp != null) {
        collectAllTopLevelDomainRecordsByDomain(fs, conf, 1282844121161L, fp.getRootDomainHash(),
            localFileSystem, new Path(tempFile.getAbsolutePath()));
        SequenceFile.Reader reader = new SequenceFile.Reader(localFileSystem,
            new Path(tempFile.getAbsolutePath()), conf);
        try {
          FlexBuffer key = new FlexBuffer();
          URLFPV2 src = new URLFPV2();
          TextBytes url = new TextBytes();
          DataInputBuffer inputBuffer = new DataInputBuffer();
          while (reader.next(key, src)) {
            inputBuffer.reset(key.get(), key.getOffset(), key.getCount());
            long targetFP = inputBuffer.readLong();
            float pageRank = inputBuffer.readFloat();
            // ok initialize text bytes ...
            int textLen = WritableUtils.readVInt(inputBuffer);
            url.set(key.get(), inputBuffer.getPosition(), textLen);
            LOG.info("PR:" + pageRank + " URL:" + url.toString());
          }
        } finally {
          reader.close();
        }
      }
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
      // tempFile.delete();
    }
  } catch (IOException e) {
    LOG.error(CCStringUtils.stringifyException(e));
  }
}
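The loop above decodes a composite key of the form [long fingerprint][float pageRank][VInt length][UTF-8 bytes], using getPosition() to locate the string bytes after the fixed-width fields. Here is an illustrative, self-contained sketch of the same decoding idea (the key layout is hypothetical and modeled on the example; standard Text is used in place of CommonCrawl's TextBytes):

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;

public class CompositeKeyDecode {
  public static void main(String[] args) throws IOException {
    // build a key: [long fingerprint][float pageRank][vint length][utf8 bytes]
    DataOutputBuffer keyOut = new DataOutputBuffer();
    keyOut.writeLong(0x1234L);
    keyOut.writeFloat(0.85f);
    byte[] urlBytes = "http://example.com/".getBytes("UTF-8");
    WritableUtils.writeVInt(keyOut, urlBytes.length);
    keyOut.write(urlBytes);

    // decode it with a DataInputBuffer over the same byte[]
    DataInputBuffer in = new DataInputBuffer();
    in.reset(keyOut.getData(), 0, keyOut.getLength());
    long fp = in.readLong();
    float pageRank = in.readFloat();
    int textLen = WritableUtils.readVInt(in);
    Text url = new Text();
    url.set(keyOut.getData(), in.getPosition(), textLen); // slice located via getPosition()
    System.out.println("FP:" + fp + " PR:" + pageRank + " URL:" + url);
  }
}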
From source file:org.commoncrawl.service.queryserver.query.URLLinksQuery.java
License:Open Source License
private static void readPaginatedInlinkingDomainInfo(final DatabaseIndexV2.MasterDatabaseIndex masterIndex,
    FileSystem indexFileSystem, Path indexPath, Path detailPath, int sortOrder, int pageNumber, int pageSize,
    QueryResult<Writable, Writable> resultOut) throws IOException {
  // if descending sort order ...
  // take pageNumber * pageSize as starting point
  long offset = 0;
  long startPos = 0;
  long endPos = 0;
  FSDataInputStream indexStream = indexFileSystem.open(indexPath);
  try {
    // read in the total record count ...
    int totalRecordCount = indexStream.readInt();
    LOG.info("***RecordCount:" + totalRecordCount + " Allocating Buffer Of:" + (totalRecordCount * 4)
        + " bytes. FileLength:" + indexFileSystem.getFileStatus(indexPath).getLen());
    // read in index header data upfront
    byte indexHeaderData[] = new byte[totalRecordCount * 4];
    // read it
    indexStream.readFully(indexHeaderData);
    // mark string start pos
    long detailStartPos = indexStream.getPos();
    // initialize index header reader stream
    DataInputBuffer indexHeaderStream = new DataInputBuffer();
    indexHeaderStream.reset(indexHeaderData, 0, indexHeaderData.length);
    resultOut.getResults().clear();
    resultOut.setPageNumber(pageNumber);
    resultOut.setTotalRecordCount(totalRecordCount);
    if (sortOrder == ClientQueryInfo.SortOrder.ASCENDING) {
      startPos = pageNumber * pageSize;
      endPos = Math.min(startPos + pageSize, totalRecordCount);
      offset = pageNumber * pageSize;
    } else {
      startPos = totalRecordCount - ((pageNumber + 1) * pageSize);
      endPos = startPos + pageSize;
      startPos = Math.max(0, startPos);
      offset = totalRecordCount - ((pageNumber + 1) * pageSize);
    }
    //LOG.info("readPaginatedResults called on Index with sortOrder:" + sortOrder + " pageNumber: " + pageNumber + " pageSize:" + pageSize + " offset is:" + offset);
    if (startPos < totalRecordCount) {
      //LOG.info("Seeking to Offset:" + startPos);
      indexHeaderStream.skip(startPos * 4);
      //LOG.info("Reading from:" + startPos + " to:" + endPos + " (exclusive)");
      for (long i = startPos; i < endPos; ++i) {
        // read data offset ...
        int domainDataPos = indexHeaderStream.readInt();
        // seek to it
        indexStream.seek(detailStartPos + domainDataPos);
        // read the detail data
        InlinkingDomainInfo domainInfo = new InlinkingDomainInfo();
        domainInfo.readFields(indexStream);
        // ok extract name
        String domainName = domainInfo.getDomainName();
        if (domainName.length() == 0) {
          //TODO: NEED TO TRACK THIS DOWN
          domainName = "<<OOPS-NULL>>";
        }
        Text key = new Text(domainName);
        domainInfo.setFieldClean(InlinkingDomainInfo.Field_DOMAINNAME);
        if (sortOrder == ClientQueryInfo.SortOrder.DESCENDING) {
          resultOut.getResults().add(0, new QueryResultRecord<Writable, Writable>(key, domainInfo));
        } else {
          resultOut.getResults().add(new QueryResultRecord<Writable, Writable>(key, domainInfo));
        }
      }
    }
  } finally {
    indexStream.close();
  }
}
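The pagination above hinges on the index header being a fixed-width table of 4-byte offsets held in memory: DataInputBuffer.skip() jumps to the first entry of the requested page and readInt() pulls one detail-file offset per record. A self-contained sketch of that offset-table paging (the file layout here is hypothetical, mirroring the example):

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

public class OffsetTablePaging {
  public static void main(String[] args) throws IOException {
    // build a fake offset table with 10 int entries
    DataOutputBuffer table = new DataOutputBuffer();
    for (int i = 0; i < 10; ++i) {
      table.writeInt(i * 100); // offset of record i in some detail file
    }
    int pageNumber = 2;
    int pageSize = 3;
    DataInputBuffer header = new DataInputBuffer();
    header.reset(table.getData(), 0, table.getLength());
    long startPos = (long) pageNumber * pageSize;
    long endPos = Math.min(startPos + pageSize, 10);
    header.skip(startPos * 4); // 4 bytes per entry
    for (long i = startPos; i < endPos; ++i) {
      System.out.println("record " + i + " lives at detail offset " + header.readInt());
    }
  }
}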
From source file:org.commoncrawl.util.CharsetUtils.java
License:Open Source License
/** last resort - detect encoding using charset detector **/
public static String detectCharacterEncoding(byte[] contentBytes, int offset, int length,
    EncodingDetector detectorType) {
  if (contentBytes != null && length != 0) {
    if (detectorType == EncodingDetector.MOZILLA) {
      DetectorState state = new DetectorState();
      nsDetector detector = new nsDetector(nsPSMDetector.ALL);
      if (offset != 0) {
        int tempBufferLen = Math.min(length, MAX_CHARS_TO_DETECT);
        byte[] tempBuffer = new byte[tempBufferLen];
        System.arraycopy(contentBytes, offset, tempBuffer, 0, tempBufferLen);
        contentBytes = tempBuffer;
        offset = 0;
        length = tempBufferLen;
      }
      detector.Init(state);
      boolean isAscii = detector.isAscii(contentBytes, length);
      if (!isAscii) {
        isAscii = detector.DoIt(contentBytes, Math.min(length, MAX_CHARS_TO_DETECT), false);
      }
      detector.DataEnd();
      if (isAscii) {
        return null;
      } else if (state._detectedCharset != null) {
        return state._detectedCharset;
      } else {
        String prob[] = detector.getProbableCharsets();
        if (prob != null && prob.length != 0) {
          return prob[0];
        }
      }
    } else {
      // instantiate icu charset detector ...
      CharsetDetector detector = new CharsetDetector();
      DataInputBuffer buffer = new DataInputBuffer();
      buffer.reset(contentBytes, offset, length);
      try {
        detector.setText(buffer);
        CharsetMatch matches[] = detector.detectAll();
        if (matches != null && matches.length != 0) {
          int kThresold = 10;
          CharsetMatch bestMatch = null;
          for (int i = 0; i < matches.length; ++i) {
            if (bestMatch == null || matches[i].getConfidence() > bestMatch.getConfidence()) {
              bestMatch = matches[i];
            }
          }
          if (bestMatch != null) {
            return bestMatch.getName();
          } else {
            return matches[0].getName();
          }
        }
      } catch (Exception e) {
        LOG.error(CCStringUtils.stringifyException(e));
      } finally {
      }
    }
  }
  return null;
}
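The ICU branch above works because DataInputBuffer is a regular DataInputStream/InputStream, so it can be handed to stream-based APIs without copying the byte[]. A short illustrative sketch of that idea with ICU4J's com.ibm.icu.text.CharsetDetector (the detect() call and sample text are assumptions; the original uses detectAll() plus its own best-match loop):

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;

public class DetectCharsetFromBuffer {
  public static void main(String[] args) throws IOException {
    byte[] content = "Ein einfacher Beispieltext fuer die Erkennung".getBytes("ISO-8859-1");
    DataInputBuffer buffer = new DataInputBuffer();
    buffer.reset(content, 0, content.length);

    CharsetDetector detector = new CharsetDetector();
    detector.setText(buffer); // DataInputBuffer is consumed as an InputStream
    CharsetMatch match = detector.detect();
    if (match != null) {
      System.out.println("Detected:" + match.getName() + " confidence:" + match.getConfidence());
    }
  }
}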
From source file:org.commoncrawl.util.CompressedURLFPListV2.java
License:Open Source License
public static void main(String[] args) {
  // initialize ...
  final Configuration conf = new Configuration();
  conf.addResource("nutch-default.xml");
  conf.addResource("nutch-site.xml");
  conf.addResource("core-site.xml");
  conf.addResource("hdfs-site.xml");
  conf.addResource("mapred-site.xml");
  BasicConfigurator.configure();
  CrawlEnvironment.setHadoopConfig(conf);
  try {
    FileSystem fs = CrawlEnvironment.getDefaultFileSystem();
    Path testFile = new Path("crawl/linkdb/merged1282844121161/linkData/part-00000");
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, testFile, conf);
    URLFPV2 fp = new URLFPV2();
    BytesWritable bytes = new BytesWritable();
    while (reader.next(fp, bytes)) {
      if (bytes.getLength() != 0) {
        DataInputBuffer inputStream = new DataInputBuffer();
        inputStream.reset(bytes.get(), bytes.getLength());
        CompressedURLFPListV2.Reader listReader = new CompressedURLFPListV2.Reader(inputStream);
        while (listReader.hasNext()) {
          URLFPV2 nextFP = listReader.next();
          LOG.info("DH:" + nextFP.getDomainHash() + " UH:" + nextFP.getUrlHash());
        }
      } else {
        LOG.error("ZERO BYTE LIST!");
      }
    }
    reader.close();
  } catch (IOException e) {
    LOG.error(CCStringUtils.stringifyException(e));
  }
  if (1 == 1)
    return;
  validateDuplicateChecking();
  // validateReallyBigList();
  validateURLFPSerializationRootDomain();
  validateURLFPSerializationSingleSubDomain();
  validateURLFPSerializationMultiDomain();
  validateURLFPFlagSerializationRootDomain();
  validateURLFPFlagSerializationMultipleSubDomains();
  validateURLFPFlagSerializationOneSubDomain();
}
From source file:org.commoncrawl.util.MultiFileMergeUtils.java
License:Open Source License
static void scanToItemThenDisplayNext(FileSystem fs, Path path, Configuration conf, URLFPV2 targetItem)
    throws IOException {
  DataOutputBuffer rawKey = new DataOutputBuffer();
  DataInputBuffer keyDataStream = new DataInputBuffer();
  SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
  ValueBytes valueBytes = reader.createValueBytes();
  int i = 0;
  while (reader.nextRawKey(rawKey) != -1) {
    URLFPV2 keyObject = new URLFPV2();
    keyDataStream.reset(rawKey.getData(), 0, rawKey.getLength());
    keyObject.readFields(keyDataStream);
    rawKey.reset();
    reader.nextRawValue(valueBytes);
    if (keyObject.compareTo(targetItem) == 0) {
      reader.nextRawKey(rawKey);
      URLFPV2 nextKeyObject = new URLFPV2();
      keyDataStream.reset(rawKey.getData(), 0, rawKey.getLength());
      nextKeyObject.readFields(keyDataStream);
      LOG.info("Target Domain:" + targetItem.getDomainHash() + " FP:" + targetItem.getUrlHash() + " NextDomain:"
          + nextKeyObject.getDomainHash() + " NextHash:" + nextKeyObject.getUrlHash());
      break;
    }
  }
  reader.close();
}
From source file:org.commoncrawl.util.MultiFileMergeUtils.java
License:Open Source License
static void addFirstNFPItemsToSet(FileSystem fs, Path path, Configuration conf, Set<URLFPV2> outputSet, int nItems)
    throws IOException {
  DataOutputBuffer rawKey = new DataOutputBuffer();
  DataInputBuffer keyDataStream = new DataInputBuffer();
  SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
  ValueBytes valueBytes = reader.createValueBytes();
  int i = 0;
  while (reader.nextRawKey(rawKey) != -1) {
    URLFPV2 keyObject = new URLFPV2();
    keyDataStream.reset(rawKey.getData(), 0, rawKey.getLength());
    keyObject.readFields(keyDataStream);
    outputSet.add(keyObject);
    rawKey.reset();
    reader.nextRawValue(valueBytes);
    if (++i == nItems) {
      break;
    }
  }
  reader.close();
}
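Both MultiFileMergeUtils helpers use the same raw-key idiom: SequenceFile.Reader.nextRawKey() fills a reusable DataOutputBuffer with the serialized key, and a reusable DataInputBuffer is reset over it to rehydrate the Writable, while nextRawValue() skips the value without decoding it. A generic sketch of that pattern (the local demo path is hypothetical, and standard LongWritable/Text types stand in for the CommonCrawl URLFPV2):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class RawKeyScan {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.getLocal(conf);
    Path path = new Path("/tmp/rawKeyScanDemo.seq"); // hypothetical demo file

    // write a small file so the scan below has something to read
    SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf, path, LongWritable.class, Text.class);
    for (long k = 0; k < 5; ++k) {
      writer.append(new LongWritable(k), new Text("value-" + k));
    }
    writer.close();

    DataOutputBuffer rawKey = new DataOutputBuffer();
    DataInputBuffer keyDataStream = new DataInputBuffer();
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
    SequenceFile.ValueBytes valueBytes = reader.createValueBytes();
    try {
      while (reader.nextRawKey(rawKey) != -1) {
        LongWritable key = new LongWritable();
        keyDataStream.reset(rawKey.getData(), 0, rawKey.getLength());
        key.readFields(keyDataStream);
        rawKey.reset(); // reuse the output buffer for the next raw key
        reader.nextRawValue(valueBytes); // advance past the value without decoding it
        System.out.println("raw key decoded as:" + key.get());
      }
    } finally {
      reader.close();
    }
  }
}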
From source file:org.commoncrawl.util.TextBytes.java
License:Open Source License
public static void main(String[] args) {
  // run some tests on the new code
  String aTestString = new String("A Test String");
  // convert it to bytes
  byte bytes[] = aTestString.getBytes();
  // over allocate an array
  byte overAllocated[] = new byte[bytes.length * 2];
  // copy source
  System.arraycopy(bytes, 0, overAllocated, bytes.length, bytes.length);
  // now allocate a TextBytes
  TextBytes textBytes = new TextBytes();
  // set the overallocated buffer as the backing store
  textBytes.set(overAllocated, bytes.length, bytes.length);
  // convert it to string first
  String toString = textBytes.toString();
  // validate equal to original
  Assert.assertTrue(aTestString.equals(toString));
  // ok now write it to output buffer
  DataOutputBuffer outputBuffer = new DataOutputBuffer();
  // write string
  try {
    textBytes.write(outputBuffer);
    // read length
    DataInputBuffer inputBuffer = new DataInputBuffer();
    inputBuffer.reset(outputBuffer.getData(), 0, outputBuffer.size());
    int encodedLength = WritableUtils.readVInt(inputBuffer);
    // validate arrays match ...
    Assert.assertTrue(encodedLength == bytes.length);
    Assert.assertEquals(WritableComparator.compareBytes(bytes, 0, bytes.length, outputBuffer.getData(),
        inputBuffer.getPosition(), outputBuffer.getLength() - inputBuffer.getPosition()), 0);
    // ok reset input buffer again ...
    inputBuffer.reset(outputBuffer.getData(), 0, outputBuffer.size());
    // read in fields
    textBytes.readFields(inputBuffer);
    // ok see if we are not using the original backing store ...
    Assert.assertTrue(textBytes.getBytes() != overAllocated);
    // validate buffers match to original
    Assert.assertEquals(WritableComparator.compareBytes(bytes, 0, bytes.length, textBytes.getBytes(),
        textBytes.getOffset(), textBytes.getLength()), 0);
  } catch (IOException e) {
    e.printStackTrace();
  }
}
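A compact sketch of the same round-trip idea using the standard org.apache.hadoop.io.Text class (TextBytes is CommonCrawl-specific): serialize a Writable into a DataOutputBuffer, wrap the result in a DataInputBuffer, skip the VInt length prefix, and compare the raw payload bytes before doing a full readFields round trip.

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.io.WritableUtils;

public class WritableRoundTrip {
  public static void main(String[] args) throws IOException {
    byte[] raw = "A Test String".getBytes("UTF-8");
    Text original = new Text(raw);

    // serialize
    DataOutputBuffer out = new DataOutputBuffer();
    original.write(out);

    // read the length prefix and compare the payload bytes in place
    DataInputBuffer in = new DataInputBuffer();
    in.reset(out.getData(), 0, out.getLength());
    int encodedLength = WritableUtils.readVInt(in);
    System.out.println("prefix says " + encodedLength + " bytes, expected " + raw.length);
    int cmp = WritableComparator.compareBytes(raw, 0, raw.length, out.getData(), in.getPosition(),
        out.getLength() - in.getPosition());
    System.out.println("payload bytes match: " + (cmp == 0));

    // full readFields round trip
    Text copy = new Text();
    in.reset(out.getData(), 0, out.getLength());
    copy.readFields(in);
    System.out.println("round-tripped value: " + copy);
  }
}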