List of usage examples for java.util.concurrent.Semaphore.acquireUninterruptibly
public void acquireUninterruptibly()
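Before the project-specific examples below, here is a minimal, self-contained sketch of the basic pattern they all share (the class and variable names are illustrative, not taken from any of the sources): a background worker releases a permit when it finishes, and the calling thread blocks on acquireUninterruptibly(), which, unlike acquire(), does not throw InterruptedException and keeps waiting even if the thread is interrupted.

import java.util.concurrent.Semaphore;

public class AcquireUninterruptiblyExample {
  public static void main(String[] args) {
    // start with zero permits so the main thread blocks until the worker releases one
    final Semaphore done = new Semaphore(0);

    new Thread(() -> {
      try {
        Thread.sleep(500); // simulate asynchronous work
      } catch (InterruptedException ignored) {
      } finally {
        done.release(); // signal completion
      }
    }).start();

    // blocks until a permit is available; ignores interruption instead of
    // throwing InterruptedException
    done.acquireUninterruptibly();
    System.out.println("Worker finished");
  }
}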
From source file:org.commoncrawl.service.listcrawler.CacheManager.java
/********************************************************************************************************/
public static void main(String[] args) {
  final EventLoop eventLoop = new EventLoop();
  eventLoop.start();
  final CacheManager manager = new CacheManager(eventLoop);
  // delete active log if it exists ...
  manager.getActiveLogFilePath().delete();
  try {
    manager.initialize(INIT_FLAG_SKIP_CACHE_WRITER_INIT | INIT_FLAG_SKIP_HDFS_WRITER_INIT);
  } catch (IOException e1) {
    LOG.error(CCStringUtils.stringifyException(e1));
    return;
  }
  MessageDigest digester;
  try {
    digester = MessageDigest.getInstance("MD5");
  } catch (NoSuchAlgorithmException e1) {
    LOG.error(CCStringUtils.stringifyException(e1));
    return;
  }
  final byte[] randomBytes = new byte[1 << 15];
  LOG.info("Building Random Digest");
  for (int i = 0; i < randomBytes.length; i += 16) {
    long time = System.nanoTime();
    digester.update((new UID() + "@" + time).getBytes());
    System.arraycopy(digester.digest(), 0, randomBytes, i, 16);
  }

  final Semaphore semaphore = new Semaphore(0);

  if (args[0].equals("populate")) {
    manager.startCacheWriterThread();
    manager.startHDFSFlusherThread();
    try {
      LOG.info("Done Building Random Digest");
      LOG.info("Writing Items To Disk");
      for (int i = 0; i < 1000000; ++i) {
        if (i % 1000 == 0) {
          LOG.info("Wrote:" + i + " entries");
        }
        final CacheItem item1 = new CacheItem();
        item1.setUrl(manager.normalizeURL("http://www.domain.com/foobar/" + i));
        item1.setContent(new Buffer(randomBytes));
        item1.setUrlFingerprint(URLFingerprint.generate64BitURLFPrint(item1.getUrl()));
        manager.cacheItem(item1, null);
        Thread.sleep(1);
        if (i != 0 && i % 10000 == 0) {
          LOG.info("Hit 10000 items.. sleeping for 20 seconds");
          Thread.sleep(20 * 1000);
        }
      }
      Thread.sleep(30000);
      for (int i = 0; i < 1000000; ++i) {
        final String url = new String("http://www.domain.com/foobar/" + i);
        manager.checkCacheForItem(url, new CacheItemCheckCallback() {
          @Override
          public void cacheItemAvailable(String url, CacheItem item) {
            Assert.assertTrue(item.getUrl().equals(url));
            String itemIndex = url.substring("http://www.domain.com/foobar/".length());
            int itemNumber = Integer.parseInt(itemIndex);
            if (itemNumber == 999999) {
              semaphore.release();
            }
          }

          @Override
          public void cacheItemNotFound(String url) {
            Assert.assertTrue(false);
          }
        });
      }
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
    } catch (InterruptedException e2) {
    }
  } else if (args[0].equals("read")) {
    try {
      final CacheItem item1 = new CacheItem();
      item1.setUrl(manager.normalizeURL("http://www.domain.com/barz/"));
      item1.setUrlFingerprint(URLFingerprint.generate64BitURLFPrint(item1.getUrl()));
      item1.setContent(new Buffer(randomBytes));
      manager.cacheItem(item1, null);
      // queue up cache load requests ....
      for (int i = 0; i < 10000; ++i) {
        final String url = new String("http://www.domain.com/foobar/" + i);
        eventLoop.setTimer(new Timer(1, false, new Timer.Callback() {
          @Override
          public void timerFired(Timer timer) {
            manager.checkCacheForItem(url, new CacheItemCheckCallback() {
              @Override
              public void cacheItemAvailable(String url, CacheItem item) {
                LOG.info("FOUND Item for URL:" + url + " ContentSize:" + item.getContent().getCount());
              }

              @Override
              public void cacheItemNotFound(String url) {
                LOG.info("DIDNOT Find Item for URL:" + url);
              }
            });
          }
        }));
      }
      eventLoop.setTimer(new Timer(1, false, new Timer.Callback() {
        @Override
        public void timerFired(Timer timer) {
          manager.checkCacheForItem(item1.getUrl(), new CacheItemCheckCallback() {
            @Override
            public void cacheItemAvailable(String url, CacheItem item) {
              LOG.info("FOUND Item for URL:" + url + " ContentSize:" + item.getContent().getCount());
            }

            @Override
            public void cacheItemNotFound(String url) {
              LOG.info("DIDNOT Find Item for URL:" + url);
            }
          });
        }
      }));
    } catch (IOException e) {
      LOG.error(CCStringUtils.stringifyException(e));
    }
  }

  // block (uninterruptibly) until the final cache callback releases a permit
  semaphore.acquireUninterruptibly();
}
From source file:org.commoncrawl.service.listcrawler.CrawlHistoryManager.java
private static void testWriteMapFileToHDFS(EventLoop eventLoop) {
  try {
    // initialize log manager
    CrawlHistoryManager logManager = initializeTestLogManager(eventLoop, true);

    // initialize item list
    TreeMap<URLFP, ProxyCrawlHistoryItem> items = buildTestList(urlList1);

    final TreeMap<String, URLFP> urlToURLFPMap = new TreeMap<String, URLFP>();
    for (Map.Entry<URLFP, ProxyCrawlHistoryItem> item : items.entrySet()) {
      urlToURLFPMap.put(item.getValue().getOriginalURL(), item.getKey());
    }

    // add to local item map in log manager
    for (ProxyCrawlHistoryItem item : items.values()) {
      logManager.appendItemToLog(item);
    }
    // ok shutdown log manager ...
    logManager.shutdown();

    // restart - reload log file ...
    logManager = initializeTestLogManager(eventLoop, false);
    // write to 'hdfs'
    logManager.doCheckpoint();

    syncAndValidateItems(items, logManager);

    logManager.shutdown();

    // restart
    logManager = initializeTestLogManager(eventLoop, false);

    // tweak original items
    updateTestItemStates(items);

    // ok append items
    for (ProxyCrawlHistoryItem item : items.values()) {
      logManager.appendItemToLog(item);
    }

    syncAndValidateItems(items, logManager);

    // ok now checkpoint the items
    logManager.doCheckpoint();

    // ok now validate one last time
    syncAndValidateItems(items, logManager);

    // shutdown
    logManager.shutdown();
    logManager = null;

    {
      // start from scratch ...
      final CrawlHistoryManager logManagerTest = initializeTestLogManager(eventLoop, true);
      // create a final version of the tree map reference
      final TreeMap<URLFP, ProxyCrawlHistoryItem> itemList = items;
      // create filename
      File urlInputFile = new File(logManagerTest.getLocalDataDir(),
          "testURLS-" + System.currentTimeMillis());
      // ok create a crawl list from urls
      CrawlList.generateTestURLFile(urlInputFile, urlList1);
      long listId = logManagerTest.loadList(urlInputFile, 0);
      CrawlList listObject = logManagerTest.getList(listId);

      // start with negative permits so that a single acquireUninterruptibly()
      // only succeeds once every item in the list has released a permit
      final Semaphore listCompletionSemaphore = new Semaphore(-(itemList.size() - 1));

      listObject.setEventListener(new CrawlList.CrawlListEvents() {
        @Override
        public void itemUpdated(URLFP itemFingerprint) {
          listCompletionSemaphore.release();
        }
      });

      // ok start the appropriate threads
      logManagerTest.startLogWriterThread(0);
      logManagerTest.startListLoaderThread();
      logManagerTest.startQueueLoaderThread(new CrawlQueueLoader() {
        @Override
        public void queueURL(URLFP urlfp, String url) {
          logManagerTest.crawlComplete(
              proxyCrawlHitoryItemToCrawlURL(itemList.get(urlToURLFPMap.get(url))));
        }

        @Override
        public void flush() {
          // TODO Auto-generated method stub
        }
      });

      LOG.info("Waiting for Release");
      // and wait for the finish
      listCompletionSemaphore.acquireUninterruptibly();
      LOG.info("Got Here");
    }
  } catch (IOException e) {
    LOG.error(CCStringUtils.stringifyException(e));
  }
}
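The example above seeds the semaphore with negative permits, new Semaphore(-(itemList.size() - 1)), so that a single acquireUninterruptibly() call only succeeds after every item has released a permit. A minimal sketch of that counting idiom follows; the class name and task count are illustrative, not taken from the source.

import java.util.concurrent.Semaphore;

public class WaitForAllTasks {
  public static void main(String[] args) {
    final int taskCount = 5; // illustrative task count
    // -(taskCount - 1) permits: the first acquire can only succeed
    // after all taskCount workers have called release() once
    final Semaphore allDone = new Semaphore(-(taskCount - 1));

    for (int i = 0; i < taskCount; ++i) {
      final int id = i;
      new Thread(() -> {
        System.out.println("task " + id + " done");
        allDone.release(); // one permit per finished task
      }).start();
    }

    allDone.acquireUninterruptibly(); // blocks until every task has released
    System.out.println("all tasks complete");
  }
}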
From source file:org.commoncrawl.service.listcrawler.CrawlHistoryManager.java
private static void launchInTestMode() {
  File baseTestDir = new File("/tmp/logManagerTest");
  FileUtils.recursivelyDeleteFile(baseTestDir);
  baseTestDir.mkdir();
  File remoteDir = new File(baseTestDir, "remote");
  File localDir = new File(baseTestDir, "local");
  remoteDir.mkdir();
  localDir.mkdir();

  final TreeMap<String, URLFP> urlToFPMap = new TreeMap<String, URLFP>();
  final TreeMap<URLFP, String> urlFPToString = new TreeMap<URLFP, String>();

  Set<String> list1 = Sets.newHashSet(urlList1);
  Set<String> list2 = Sets.newHashSet(urlList2);
  final Set<String> combined = Sets.union(list1, list2);
  Set<String> difference = Sets.difference(list1, list2);
  final Set<String> completedURLS = new HashSet<String>();

  for (String url : combined) {
    URLFP fingerprint = URLUtils.getURLFPFromURL(url, true);
    urlToFPMap.put(url, fingerprint);
    urlFPToString.put(fingerprint, url);
  }

  File testInputFile1 = new File(localDir, "INPUT_LIST-" + System.currentTimeMillis());
  File testInputFile2 = new File(localDir, "INPUT_LIST-" + (System.currentTimeMillis() + 1));

  try {
    generateTestURLFile(testInputFile1, urlList1);
    generateTestURLFile(testInputFile2, urlList2);

    FileSystem localFileSystem = FileSystem.getLocal(CrawlEnvironment.getHadoopConfig());

    EventLoop eventLoop = new EventLoop();
    eventLoop.start();

    final CrawlHistoryManager logManager = new CrawlHistoryManager(localFileSystem,
        new Path(remoteDir.getAbsolutePath()), localDir, eventLoop, 0);

    final LinkedBlockingQueue<ProxyCrawlHistoryItem> queue = new LinkedBlockingQueue<ProxyCrawlHistoryItem>();

    final Semaphore initialListComplete = new Semaphore(0);

    logManager.startQueueLoaderThread(new CrawlQueueLoader() {
      @Override
      public void queueURL(URLFP urlfp, String url) {
        ProxyCrawlHistoryItem item = new ProxyCrawlHistoryItem();
        item.setOriginalURL(url);
        queue.add(item);
      }

      @Override
      public void flush() {
        // TODO Auto-generated method stub
      }
    });

    Thread queueTestThread = new Thread(new Runnable() {
      @Override
      public void run() {
        while (true) {
          try {
            ProxyCrawlHistoryItem item = queue.take();
            if (item.getOriginalURL().length() == 0) {
              break;
            } else {
              System.out.println("Got:" + item.getOriginalURL());

              CrawlURL urlObject = new CrawlURL();

              Assert.assertTrue(!completedURLS.contains(item.getOriginalURL()));
              completedURLS.add(item.getOriginalURL());

              urlObject.setLastAttemptResult((byte) CrawlURL.CrawlResult.SUCCESS);
              urlObject.setUrl(item.getOriginalURL());
              urlObject.setResultCode(200);

              logManager.crawlComplete(urlObject);

              if (completedURLS.equals(combined)) {
                System.out.println("Hit Trigger URL. Releasing InitialListComplete Semaphore");
                initialListComplete.release(1);
              }
            }
          } catch (InterruptedException e) {
          }
        }
      }
    });

    queueTestThread.start();

    logManager.loadList(testInputFile1, 0);
    logManager.loadList(testInputFile2, 0);

    System.out.println("Waiting for Initial List to Complete");
    initialListComplete.acquireUninterruptibly();
    System.out.println("Woke Up");

    try {
      eventLoop.getEventThread().join();
    } catch (InterruptedException e) {
      e.printStackTrace();
    }
  } catch (IOException e) {
    e.printStackTrace();
  }
}
From source file:org.commoncrawl.service.listcrawler.DataTransferAgent.java
static int uploadSingeFile(CCBridgeServerMapping mapping, FileSystem fs, Configuration conf,
    Path hdfsFilePath, String uploadName, EventLoop eventLoop) throws IOException {

  final FileStatus fileStatus = fs.getFileStatus(hdfsFilePath);
  LOG.info("Uploading:" + uploadName + " size:" + fileStatus.getLen() + " to:" + mapping._internalName);

  {
    // construct url
    URL fePostURL = new URL("http://" + mapping._externalName + ":8090/");
    LOG.info("POST URL IS:" + fePostURL.toString());

    // open input stream
    final FSDataInputStream is = fs.open(hdfsFilePath);
    final Semaphore blockingSemaphore = new Semaphore(0);
    NIOHttpConnection connection = null;
    try {
      // create connection
      connection = new NIOHttpConnection(fePostURL, eventLoop.getSelector(), eventLoop.getResolver(), null);
      // set listener
      connection.setListener(new Listener() {
        @Override
        public void HttpConnectionStateChanged(NIOHttpConnection theConnection, State oldState, State state) {
          LOG.info("Connection State Changed to:" + state.toString());
          if (state == State.DONE || state == State.ERROR) {
            //LOG.info("Connection Transition to Done or Error");
            //LOG.info("Response Headers:" + theConnection.getResponseHeaders().toString());
            blockingSemaphore.release();
          }
        }

        @Override
        public void HttpContentAvailable(NIOHttpConnection theConnection, NIOBufferList contentBuffer) {
          // TODO Auto-generated method stub
        }
      });

      // set headers
      connection.getRequestHeaders().reset();
      connection.getRequestHeaders().prepend("PUT /put?src=" + uploadName + " HTTP/1.1", null);
      connection.getRequestHeaders().set("Host", mapping._internalName + ":8090");
      connection.getRequestHeaders().set("Content-Length", Long.toString(fileStatus.getLen()));
      connection.getRequestHeaders().set("Connection", "keep-alive");
      connection.setPopulateDefaultHeaderItems(false);

      final LinkedBlockingDeque<BufferStruct> _loaderQueue = new LinkedBlockingDeque<BufferStruct>(20);
      final AtomicBoolean eof = new AtomicBoolean();
      final ByteBuffer sentinel = ByteBuffer.allocate(4096);
      sentinel.position(sentinel.position());

      final Thread loaderThread = new Thread(new Runnable() {
        int _id = 0;

        @Override
        public void run() {
          int bytesRead;
          byte incomingBuffer[] = new byte[4096 * 10];
          try {
            while ((bytesRead = is.read(incomingBuffer)) != -1) {
              ByteBuffer buffer = ByteBuffer.wrap(incomingBuffer, 0, bytesRead);
              buffer.position(bytesRead);
              //LOG.info("Loader Thread Read:" + bytesRead + " Buffer:" + ++_id);
              try {
                _loaderQueue.put(new BufferStruct(buffer, _id));
              } catch (InterruptedException e) {
                LOG.error(CCStringUtils.stringifyException(e));
                break;
              }
              incomingBuffer = new byte[4096 * 10];
            }
            try {
              _loaderQueue.put(new BufferStruct(sentinel, ++_id));
            } catch (InterruptedException e) {
            }
          } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            return;
          }
        }
      });

      loaderThread.start();

      // set data source ...
      connection.setDataSource(new DataSource() {
        int bytesTransferred = 0;

        @Override
        public boolean read(NIOBufferList dataBuffer) throws IOException {
          if (eof.get())
            return true;
          //LOG.info("Connect read callback triggered");
          BufferStruct buffer = _loaderQueue.poll();
          if (buffer != null) {
            if (buffer._buffer != sentinel) {
              //LOG.info("Got Buffer:" + buffer._id);
              if (buffer._id == 1) {
                //LOG.info("Inital Buffer Bytes:" + new String(buffer._buffer.array(), 0, 10).toString());
              }
              dataBuffer.write(buffer._buffer);
              bytesTransferred += buffer._buffer.limit();
              //LOG.info("Read:" + buffer._buffer.limit() + " Transfered:" + bytesTransferred);
              return false;
            } else {
              //LOG.info("EOF Condition");
              dataBuffer.write(sentinel);
              eof.set(true);
              return true;
            }
          }
          return false;
        }
      });

      // open connection
      connection.open();

      // wait for connection to complete ...
      blockingSemaphore.acquireUninterruptibly();

      // kill loader thread
      loaderThread.interrupt();
      try {
        LOG.info("Waiting for Loader Thread");
        loaderThread.join();
        LOG.info("Done Waiting for Loader Thread");
      } catch (InterruptedException e) {
      }
    } finally {
      is.close();
      if (connection != null) {
        connection.close();
        LOG.info("Response Code for File:" + uploadName + "to Host: " + mapping._internalName + " is:"
            + connection.getResponseHeaders().getHttpResponseCode());
        return connection.getResponseHeaders().getHttpResponseCode();
        /*
        if (connection.getResponseHeaders().getHttpResponseCode() != 200) {
          throw new IOException("Failed to upload file:" + dataFile.getName() + " responseCode:"
              + connection.getResponseHeaders().getHttpResponseCode());
        }
        */
      }
    }
  }
  // something went wrong ???
  LOG.error("Failed to upload file:" + uploadName + " unknown response code");
  return 500;
}
From source file:org.commoncrawl.service.listcrawler.ProxyServlet.java
@Override
public void doGet(final HttpServletRequest req, final HttpServletResponse response)
    throws ServletException, IOException {

  // allocate a response data object ... which will be used by async thread to pass data to calling thread...
  final AsyncResponse responseData = new AsyncResponse();

  String queryString = req.getQueryString();

  final String originalPath = req.getParameter("url");
  final String format = (req.getParameter("renderAs") != null) ? req.getParameter("renderAs")
      : PROXY_RENDER_TYPE_NONE;
  final String timeoutStr = req.getParameter("timeout");
  final String skipHTTPGET = req.getParameter("nocachenodice");
  final long desiredTimeOutInMS = (timeoutStr != null) ? Long.parseLong(timeoutStr) : 30000;
  final boolean skipHTTPGet = (skipHTTPGET != null && skipHTTPGET.equals("1"));

  final Semaphore semaphore = new Semaphore(0);

  //LOG.info("Got Request:" + originalPath);
  final long requestStartTime = System.currentTimeMillis();
  //LOG.info("Processing Request:" + originalPath);

  String hostName = (originalPath != null) ? URLUtils.fastGetHostFromURL(originalPath) : "";
  String fullPath = null;

  if (originalPath == null || !originalPath.startsWith("http:") || hostName.length() == 0
      || queryString == null) {
    LOG.info("URL From Proxy Request:" + originalPath + " is Invalid. Sending 400 Result Code");
    responseData.setHttpErrorResponse(400, "URL From Proxy Request:" + originalPath + " is Invalid");
  } else {
    // build url path from query string
    int pathIndex = queryString.indexOf("url=");
    // grab the whole path ...
    fullPath = queryString.substring(pathIndex + "url=".length());
    // unescape it
    fullPath = URLDecoder.decode(fullPath, "UTF-8");

    //LOG.info("Doing Cache Lookup for URL:" + fullPath);
    boolean isAsyncOperation = checkCacheForURLV2(fullPath, responseData, semaphore, desiredTimeOutInMS,
        skipHTTPGet);

    if (isAsyncOperation) {
      //LOG.info("Waiting on Async Completion for URL:" + fullPath);
      semaphore.acquireUninterruptibly();
      //LOG.info("Done Waiting for Async Completion for URL:" + fullPath);
    }
  }

  // upon return we need to check the response object ...
  if (responseData.getResponseType() == AsyncResponse.ResponseType.CacheItemResponse) {
    // send cache item response ...
    sendCacheItemResponse(req, response, responseData.getCacheItem(), false, format, responseData,
        requestStartTime);
  } else if (responseData.getResponseType() == AsyncResponse.ResponseType.CrawlURLResponse) {
    sendCrawlURLResponse(req, response, responseData.getCrawlURL(), format, responseData, requestStartTime);
  } else if (responseData.getResponseType() == AsyncResponse.ResponseType.S3Response) {
    sendS3ItemResponse(req, response, responseData.getArcFileItem(), format, responseData, requestStartTime);
  } else {
    response.sendError(responseData.getHttpErrorCode(), responseData.getHttpErrorDesc());
    ProxyServer.getSingleton().logProxyFailure(responseData.getHttpErrorCode(),
        responseData.getHttpErrorDesc(), fullPath, "", responseData.getStartTime());
  }
}
From source file:org.commoncrawl.service.parser.client.Dispatcher.java
public static void main(String[] args) throws IOException {
  Configuration conf = new Configuration();
  CrawlEnvironment.setHadoopConfig(conf);

  String baseURL = "http://unknown.com/";
  if (args.length != 0) {
    baseURL = args[0];
  }

  URL baseURLObj;
  try {
    baseURLObj = new URL(baseURL);
  } catch (MalformedURLException e2) {
    throw new IOException("Invalid Base Link");
  }

  final URL finalBaseURL = (baseURLObj != null) ? baseURLObj : null;

  final DataOutputBuffer headerBuffer = new DataOutputBuffer();
  final DataOutputBuffer contentBuffer = new DataOutputBuffer();

  try {
    ByteStreams.readBytes(new InputSupplier<InputStream>() {
      @Override
      public InputStream getInput() throws IOException {
        return System.in;
      }
    }, new ByteProcessor<Long>() {
      @Override
      public Long getResult() {
        return 0L;
      }

      int currLineCharCount = 0;
      boolean processingHeaders = true;

      @Override
      public boolean processBytes(byte[] buf, int start, int length) throws IOException {
        if (processingHeaders) {
          int current = start;
          int end = current + length;
          while (processingHeaders && current != end) {
            if (buf[current] != '\r' && buf[current] != '\n') {
              currLineCharCount++;
            } else if (buf[current] == '\n') {
              if (currLineCharCount == 0) {
                headerBuffer.write(buf, start, current - start + 1);
                processingHeaders = false;
              }
              currLineCharCount = 0;
            }
            current++;
          }
          if (processingHeaders) {
            headerBuffer.write(buf, start, length);
          } else {
            length -= current - start;
            start = current;
          }
        }
        if (!processingHeaders) {
          contentBuffer.write(buf, start, length);
        }
        return true;
      }
    });

    LOG.info("HEADER LEN:" + headerBuffer.getLength());
    // System.out.println(new String(headerBuffer.getData(), 0, headerBuffer.getLength(), Charset.forName("UTF-8")));
    LOG.info("CONTENT LEN:" + contentBuffer.getLength());
    // System.out.println(new String(contentBuffer.getData(), 0, contentBuffer.getLength(), Charset.forName("UTF-8")));

    // decode header bytes ...
    String header = "";
    if (headerBuffer.getLength() != 0) {
      try {
        header = new String(headerBuffer.getData(), 0, headerBuffer.getLength(), Charset.forName("UTF-8"));
      } catch (Exception e) {
        LOG.warn(CCStringUtils.stringifyException(e));
        header = new String(headerBuffer.getData(), 0, headerBuffer.getLength(), Charset.forName("ASCII"));
      }
    }
    final String headersFinal = (header != null) ? header : "";

    LOG.info("Starting Event Loop");
    final EventLoop eventLoop = new EventLoop();
    eventLoop.start();

    try {
      // create fake hosts file ...
      //String hosts = "10.0.20.101:8072";
      // reader
      //Reader reader = new StringReader(hosts);

      // dispatcher init
      LOG.info("initializing Dispatcher");
      final Dispatcher dispatcher = new Dispatcher(eventLoop, "parserNodes");

      LOG.info("Waiting for a few seconds");
      Thread.sleep(5000);

      Thread threads[] = new Thread[TEST_THREAD_COUNT];
      final Semaphore threadWaitSem = new Semaphore(-TEST_THREAD_COUNT - 1);

      // start 100 threads
      for (int threadIdx = 0; threadIdx < TEST_THREAD_COUNT; ++threadIdx) {
        threads[threadIdx] = new Thread(new Runnable() {
          @Override
          public void run() {
            for (int i = 0; i < ITERATIONS_PER_THREAD; ++i) {
              // build parse request
              ParseRequest request = new ParseRequest();
              request.setDocId(1);
              request.setDomainId(1);
              request.setDocURL(finalBaseURL.toString());
              request.setDocHeaders(headersFinal);
              request.setDocContent(new FlexBuffer(contentBuffer.getData(), 0, contentBuffer.getLength()));

              //LOG.info("Dispatching parse request");
              ParseResult result = dispatcher.dispatchRequest(request);

              LOG.info("TID[" + Thread.currentThread().getId() + "]ReqID[" + i + "]" + " Success:"
                  + ((result != null) ? result.getParseSuccessful() : false) + " LinkCount:"
                  + ((result != null) ? result.getExtractedLinks().size() : 0));
            }
            LOG.info("Thread:" + Thread.currentThread().getId() + " Exiting");
            threadWaitSem.release();
          }
        });

        threads[threadIdx].start();
      }

      LOG.info("Waiting for threads to die");
      threadWaitSem.acquireUninterruptibly();
      LOG.info("All Threads dead.");
    } finally {
      eventLoop.stop();
    }
  } catch (IOException e) {
    LOG.error(CCStringUtils.stringifyException(e));
  } catch (InterruptedException e) {
  }
}
From source file:org.commoncrawl.service.parser.client.ParserNode.java
public ParseResult dispatchRequest(final ParseRequest request) throws IOException {
  final AtomicReference<ParseResult> result = new AtomicReference<ParseResult>();

  if (_online.get()) {
    // LOG.info("Dispatching Parse Request for URL:" + request.getDocURL()
    //     + " to Node:" + _nodeName);
    final Semaphore requestSemaphore = new Semaphore(0);

    _eventLoop.queueAsyncCallback(new org.commoncrawl.async.Callback() {
      @Override
      public void execute() {
        try {
          _asyncStub.parseDocument(request, new Callback<ParseRequest, ParseResult>() {
            @Override
            public void requestComplete(AsyncRequest<ParseRequest, ParseResult> request) {
              try {
                // LOG.info("Parse Request for URL:" + request.getInput().getDocURL()
                //     + " recvd responseStatus:" + request.getStatus()
                //     + " from Node:" + _nodeName);
                if (request.getStatus() == Status.Success) {
                  result.set(request.getOutput());
                }
              } finally {
                // LOG.info("Releasing Request Semaphore for URL:" + request.getInput().getDocURL());
                requestSemaphore.release();
              }
            }
          });
        } catch (Exception e) {
          LOG.error(CCStringUtils.stringifyException(e));
          LOG.info("Releasing Request Semaphore for URL:" + request.getDocURL());
          requestSemaphore.release();
        }
      }
    });

    // LOG.info("Waiting on ParseReq Semaphore for URL:" + request.getDocURL());
    requestSemaphore.acquireUninterruptibly();
    // LOG.info("ParseReq Semaphore signaled for URL:" + request.getDocURL());
  }

  return result.get();
}
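The dispatchRequest example above pairs an AtomicReference with a zero-permit semaphore to turn an asynchronous callback into a synchronous call that returns a value. A minimal, self-contained sketch of that pairing follows; fetchAsync and the other names are hypothetical stand-ins, not part of the commoncrawl API.

import java.util.concurrent.Semaphore;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Consumer;

public class BlockingBridge {
  // illustrative async API: invokes the callback on another thread
  static void fetchAsync(String key, Consumer<String> callback) {
    new Thread(() -> callback.accept("value-for-" + key)).start();
  }

  // wraps the async call so the caller blocks until the result arrives
  static String fetchBlocking(String key) {
    final AtomicReference<String> result = new AtomicReference<String>();
    final Semaphore done = new Semaphore(0);
    fetchAsync(key, value -> {
      result.set(value); // publish the result first
      done.release();    // then wake the waiting caller
    });
    done.acquireUninterruptibly();
    return result.get();
  }

  public static void main(String[] args) {
    System.out.println(fetchBlocking("example"));
  }
}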
From source file:org.commoncrawl.service.queryserver.master.S3Helper.java
public static ArcFileItem retrieveArcFileItem(ArchiveInfo archiveInfo, EventLoop eventLoop) throws IOException {
  // the default bucket id
  String bucketId = "commoncrawl-crawl-002";

  // ok, see if we need to switch buckets
  if (archiveInfo.getCrawlNumber() == 1) {
    bucketId = "commoncrawl";
  }

  S3Downloader downloader = new S3Downloader(bucketId, "", "", false);

  // now activate the segment log ...
  final Semaphore downloadCompleteSemaphore = new Semaphore(0);
  final StreamingArcFileReader arcFileReader = new StreamingArcFileReader(false);
  //arcFileReader.setArcFileHasHeaderItemFlag(false);

  // create a buffer list we will append incoming content into ...
  final LinkedList<ByteBuffer> bufferList = new LinkedList<ByteBuffer>();

  downloader.initialize(new S3Downloader.Callback() {
    @Override
    public boolean contentAvailable(int itemId, String itemKey, NIOBufferList contentBuffer) {
      LOG.info("ContentQuery contentAvailable called for Item:" + itemKey + " totalBytesAvailable:"
          + contentBuffer.available());
      try {
        while (contentBuffer.available() != 0) {
          bufferList.add(contentBuffer.read());
        }
        return true;
      } catch (IOException e) {
        LOG.error(CCStringUtils.stringifyException(e));
        return false;
      }
    }

    @Override
    public void downloadComplete(int itemId, String itemKey) {
      LOG.info("S3 Download Complete for item:" + itemKey);
      downloadCompleteSemaphore.release();
    }

    @Override
    public void downloadFailed(int itemId, String itemKey, String errorCode) {
      LOG.info("S3 Download Failed for item:" + itemKey);
      downloadCompleteSemaphore.release();
    }

    @Override
    public boolean downloadStarting(int itemId, String itemKey, int contentLength) {
      LOG.info("ContentQuery DownloadStarting for Item:" + itemKey + " contentLength:" + contentLength);
      return true;
    }
  }, eventLoop);

  LOG.info("Starting request for Item:"
      + hdfsNameToS3ArcFileName(archiveInfo.getArcfileDate(), archiveInfo.getArcfileIndex()) + " Offset:"
      + archiveInfo.getArcfileOffset());

  int sizeToRetrieve = (archiveInfo.getCompressedSize() != 0) ? archiveInfo.getCompressedSize() : 30000;
  sizeToRetrieve += 10;

  downloader.fetchPartialItem(
      hdfsNameToS3ArcFileName(archiveInfo.getArcfileDate(), archiveInfo.getArcfileIndex()),
      archiveInfo.getArcfileOffset() - 10, sizeToRetrieve);

  downloadCompleteSemaphore.acquireUninterruptibly();

  if (bufferList.size() == 0) {
    return null;
  }

  ByteBuffer firstBuffer = bufferList.getFirst();
  if (firstBuffer != null) {
    int offsetToGZIPHeader = scanForGZIPHeader(firstBuffer.duplicate());
    if (offsetToGZIPHeader != -1) {
      firstBuffer.position(offsetToGZIPHeader);
      LOG.info("*** Offset to GZIP Header:" + offsetToGZIPHeader);
    } else {
      LOG.error("*** Failed to find GZIP Header offset");
    }
  }

  // now try to decode content if possible
  for (ByteBuffer buffer : bufferList) {
    LOG.info("Adding Buffer of Size:" + buffer.remaining() + " Position:" + buffer.position() + " Limit:"
        + buffer.limit());
    arcFileReader.available(buffer);
  }

  ArcFileItem item = arcFileReader.getNextItem();
  if (item != null) {
    LOG.info("Request Returned item:" + item.getUri());
    LOG.info("Uncompressed Size:" + item.getContent().getCount());
  }
  return item;
}
From source file:org.commoncrawl.service.queryserver.query.DomainListQuery.java
@Override
protected long executeRemote(final FileSystem fileSystem, final Configuration conf, EventLoop eventLoop,
    SlaveDatabaseIndex instanceIndex, File tempFirDir,
    QueryProgressCallback<DomainListQueryInfo, Text, SubDomainMetadata> progressCallback) throws IOException {

  int shardsProcessed = 0;

  // ok create a semaphore for the number of shards we are going to query ...
  final Semaphore semaphore = new Semaphore(-(getCommonQueryInfo().getRelevantShardIds().size() - 1));
  // and create a record count array
  final long recordCounts[] = new long[getCommonQueryInfo().getRelevantShardIds().size()];
  final IOException exceptions[] = new IOException[getCommonQueryInfo().getRelevantShardIds().size()];

  int threadIdx = 0;
  // ok dispatch queries for each shard we are responsible for ...
  for (int shardId : getCommonQueryInfo().getRelevantShardIds()) {
    final int currentShardId = shardId;
    final int currentThreadIdx = threadIdx++;

    Thread subQueryThread = new Thread(new Runnable() {
      @Override
      public void run() {
        Path shardOutputPath = getHDFSQueryResultsFilePathForShard(currentShardId);
        LOG.info("Execute Remote for Query:" + getQueryId() + " for shardId:" + currentShardId
            + " Creating spill file @:" + shardOutputPath);
        try {
          // create SequenceFile Spill Writer ...
          SequenceFileSpillWriter<Text, SubDomainMetadata> spillWriter =
              new SequenceFileSpillWriter<Text, SubDomainMetadata>(fileSystem, conf, shardOutputPath,
                  Text.class, SubDomainMetadata.class, null, true);
          try {
            LOG.info("Execute Remote for Query:" + getQueryId() + " calling executeDomainListQuery on index");
            // scan index for matching patterns ... spill into writer ...
            recordCounts[currentThreadIdx] += _slaveDatabaseIndex.queryDomainsGivenPattern(
                getQueryData().getSearchPattern(), currentShardId, spillWriter);
            LOG.info("Execute Remote for Query:" + getQueryId() + " executeDomainListQuery returned:"
                + recordCounts[currentThreadIdx]);
          } finally {
            spillWriter.close();
            // increment semaphore count
            semaphore.release();
          }
        } catch (IOException e) {
          LOG.error("Execute Remote for Query:" + getQueryId() + " executeDomainListQuery failed with error:"
              + CCStringUtils.stringifyException(e));
          exceptions[currentThreadIdx] = e;
        }
      }
    });

    subQueryThread.start();
  }

  // ok block until all queries are complete
  LOG.info("Query:" + getQueryId() + " Waiting on Worker Threads");
  semaphore.acquireUninterruptibly();
  LOG.info("Query:" + getQueryId() + " All Threads Completed");

  for (IOException e : exceptions) {
    if (e != null) {
      LOG.error("Query:" + getQueryId() + " Failed with Exception:" + CCStringUtils.stringifyException(e));
      throw e;
    }
  }

  long cumulativeRecordCount = 0L;
  for (long recordCount : recordCounts)
    cumulativeRecordCount += recordCount;

  return cumulativeRecordCount;
}
From source file:org.commoncrawl.util.MapReduceJobStatsWriter.java
/** close and flush the log file **/
public void close(final Callback optionalAsyncCallback) {
  if (_eventLoop != null) {
    // allocate a blocking semaphore in case async callback was not specified
    final Semaphore blockingCallSemaphore = new Semaphore(0);

    // perform shutdown in worker thread ...
    _eventLoop.setTimer(new Timer(0, false, new Timer.Callback() {
      @Override
      public void timerFired(Timer timer) {
        try {
          try {
            if (_writer != null) {
              _writer.close();
            }
          } catch (IOException e) {
            LOG.error(CCStringUtils.stringifyException(e));
            _lastLogWriteException = e;
          } finally {
            _writer = null;
            try {
              if (_outputStream != null) {
                _outputStream.flush();
                _outputStream.close();
              }
            } catch (IOException e) {
              LOG.error(CCStringUtils.stringifyException(e));
              _lastLogWriteException = e;
            } finally {
              _outputStream = null;
            }
          }

          // now figure out if everything went smoothly or not
          if (_entryCount != 0 && _lastLogWriteException == null) {
            // ok so far so good... time to copy the local log file to hdfs ...
            Path hdfsPath = new Path(Environment.HDFS_LOGCOLLECTOR_BASEDIR,
                _logFamily + "/" + _groupingKey + "/" + Long.toString(_uniqueKey));
            try {
              // delete the remote file if it exists
              _remoteFileSystem.delete(hdfsPath, false);
              // ensure parent path
              _remoteFileSystem.mkdirs(hdfsPath.getParent());
              // now if the local file exists and has data
              if (_tempFileName.exists() && _tempFileName.length() != 0) {
                // copy the file to hdfs
                _remoteFileSystem.copyFromLocalFile(new Path(_tempFileName.getAbsolutePath()), hdfsPath);
              }
            } catch (IOException e) {
              LOG.error(CCStringUtils.stringifyException(e));
              _lastLogWriteException = e;
            }
          }
        } finally {
          // always delete the temp file ...
          _tempFileName.delete();
          // release semaphore
          blockingCallSemaphore.release();
          // if callback was specified, call it now
          if (optionalAsyncCallback != null) {
            optionalAsyncCallback.execute();
          }
          // stop the event loop ...
          _eventLoop.stop();
          _eventLoop = null;
        }
      }
    }));

    // now if callback was not specified... wait for blocking semaphore to signal ...
    if (optionalAsyncCallback == null) {
      blockingCallSemaphore.acquireUninterruptibly();
    }
  }
}