com.waku.mmdataextract.ComprehensiveSearch.java Source code

Java tutorial

Introduction

Here is the source code for com.waku.mmdataextract.ComprehensiveSearch.java

Source

/*
 * ComprehensiveSearch.java
 * Created on 2011-5-25; Project to Colt2010; $Id: ComprehensiveSearch.java 309 2013-04-25 16:38:44Z tristan $
 * 
 * Copyright (c) 2011, Xu Brothers and/or its affiliates. All rights reserved.
 * Xu Brothers PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
 */

package com.waku.mmdataextract;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;

import org.apache.http.entity.mime.MultipartEntity;
import org.apache.http.entity.mime.content.StringBody;
import org.apache.log4j.Logger;
import org.dom4j.Document;
import org.dom4j.Element;

import com.waku.common.http.MyHttpClient;

/**
 * Command-line scraper for the search form at {@code shouji.gd.chinamobile.com}.
 * <p>
 * {@link #main(String[])} loads the search form, reads every {@code brandId}
 * option from it, and posts the search once per brand and page. Each product
 * row not seen before is appended to {@code output/ComprehensiveSearch.csv} and
 * its image is stored under {@code output/images/}. When all brands are done the
 * collected product ids are handed to {@code CompareProductions.start}.
 * <p>
 * All state is static and unsynchronized; the class is written to be run once
 * from a single thread.
 *
 * @version $Rev: 309 $, $Date: 2013-04-26 00:38:44 +0800 (Fri, 26 Apr 2013) $
 * @author Jin
 */
public class ComprehensiveSearch {

    static Logger logger = Logger.getLogger(ComprehensiveSearch.class.getName());

    private static final String START_ACTION = "http://shouji.gd.chinamobile.com/gdmobile/displaySearch.do?flag=searchForm&imgType=1";
    private final static String SEARCH_ACTION = "http://shouji.gd.chinamobile.com/gdmobile/search.do?pageNo=005";

    /** Host that the site-relative image paths are resolved against. */
    private static final String IMAGE_HOST = "http://shouji.gd.chinamobile.com";

    /** CSV file that receives one line per distinct product. */
    private static final String OUTPUT_CSV = "output/ComprehensiveSearch.csv";

    /** Directory (with trailing slash) that receives the product images. */
    private static final String IMAGE_DIR = "output/images/";

    /**
     * A result page with fewer rows than this is treated as the last page of a
     * brand. NOTE(review): presumably the site's fixed page size - confirm.
     */
    private static final int PAGE_SIZE = 20;

    /** Size of the copy buffer used when downloading an image. */
    private static final int COPY_BUFFER_SIZE = 8192;

    /** Ids of all products written so far, in discovery order. */
    private static List<String> prodIdList = new ArrayList<String>();

    /**
     * Builds the multipart form that requests one result page of one brand.
     *
     * @param brandId    value of the {@code brandId} form field
     * @param pageNumber value of the {@code currentPage} form field
     * @return the populated form entity
     * @throws IllegalStateException if a form field cannot be encoded; a
     *         partially filled form is never returned because posting it would
     *         query something other than what the caller asked for
     */
    @SuppressWarnings("deprecation")
    private static MultipartEntity getMultipartEntity(String brandId, int pageNumber) {
        MultipartEntity reqEntity = new MultipartEntity();
        try {
            reqEntity.addPart("flag", new StringBody("search"));
            reqEntity.addPart("brandId", new StringBody(brandId));
            reqEntity.addPart("currentPage", new StringBody(String.valueOf(pageNumber)));
        } catch (UnsupportedEncodingException e) {
            throw new IllegalStateException(
                    "Cannot build search form for brandId/page " + brandId + "/" + pageNumber, e);
        }
        return reqEntity;
    }

    /**
     * Runs the complete scrape and then starts the production comparison.
     * <p>
     * The run is aborted (after logging) when the CSV file cannot be created or
     * its header cannot be written, or when the search form cannot be loaded.
     * The CSV writer is always closed, even if scraping fails part-way.
     *
     * @param args ignored
     */
    @SuppressWarnings("unchecked")
    public static void main(String[] args) {
        File csvFile = new File(OUTPUT_CSV);
        if (!ensureDirectory(csvFile.getParentFile())) {
            return;
        }

        FileWriter fw = null;
        try {
            fw = new FileWriter(csvFile);
            // NOTE(review): the column titles below look like they were lost to a
            // character-encoding problem (they read as '?'); restore them from the
            // original source if it is available.
            fw.write(
                    ",??,?,?,,?,??,,?,1,2,3,\n");

            Document firstPage = MyHttpClient.getAsDom4jDoc(START_ACTION);
            if (firstPage == null) {
                logger.error("Cannot load search form -> " + START_ACTION);
                return;
            }
            List<Element> brandOptions = firstPage.selectNodes("//select[@name='brandId']/option");
            for (Element brandOption : brandOptions) {
                String brandId = brandOption.attributeValue("value");
                // "0" is skipped; presumably it is the "no brand selected" placeholder.
                if (brandId == null || brandId.equalsIgnoreCase("0")) {
                    continue;
                }
                // Walk the pages of this brand until searchDone reports the last one.
                for (int page = 1;; page++) {
                    logger.info("Get brandId/page -> " + brandId + "/" + page);
                    if (searchDone(fw, brandId, page)) {
                        break;
                    }
                }
            }
        } catch (IOException e) {
            logger.error("Cannot write " + OUTPUT_CSV, e);
            return;
        } finally {
            close(fw, OUTPUT_CSV);
        }
        logger.info("----> Done!");

        logger.info("----> Start to compare production search ... ");
        CompareProductions.start(prodIdList, 0);
    }

    /**
     * Fetches one result page of one brand and records every product on it that
     * has not been recorded before.
     * <p>
     * For each new product the image referenced by the first cell is saved as
     * {@code <productId>.gif}, and a CSV line is written consisting of that file
     * name followed by the text of every cell except the first and the last.
     * Rows that do not have the expected shape are logged and skipped.
     *
     * @param fw      open writer for the CSV file
     * @param brandId brand to query
     * @param i       1-based page number to query
     * @return {@code true} when this was the last page of the brand (fewer than
     *         {@link #PAGE_SIZE} rows, or the page could not be loaded)
     */
    @SuppressWarnings("unchecked")
    private static boolean searchDone(FileWriter fw, String brandId, int i) {
        Document resultPage = MyHttpClient.getAsDom4jDoc(SEARCH_ACTION, getMultipartEntity(brandId, i));
        if (resultPage == null) {
            // Stop paging this brand rather than retrying the same page forever.
            logger.error("Cannot load result page for brandId/page -> " + brandId + "/" + i);
            return true;
        }
        List<Element> products = resultPage.selectNodes("//tr[@onmouseout]");
        logger.info("Get products count -> " + products.size());
        for (Element product : products) {
            List<Element> items = product.elements();
            if (items.size() < 2) {
                logger.warn("Skip row with too few cells for brandId/page -> " + brandId + "/" + i);
                continue;
            }
            Element firstItem = items.get(0);
            String productId = extractProductId(firstItem.attributeValue("onclick"));
            if (productId == null) {
                logger.warn("Skip row without product id for brandId/page -> " + brandId + "/" + i);
                continue;
            }
            if (prodIdList.contains(productId)) {
                logger.info("Get product id duplicated -> " + productId);
                continue;
            }
            logger.info("Get product id add -> " + productId);
            prodIdList.add(productId);

            // Save image here
            String toFileName = productId + ".gif";
            Element img = firstItem.element("img");
            String imgSrc = (img == null) ? null : img.attributeValue("src");
            if (imgSrc == null) {
                logger.warn("No image found for product -> " + productId);
            } else {
                saveImage(imgSrc, toFileName);
            }

            StringBuilder sb = new StringBuilder();
            sb.append(toFileName).append(',');
            // Cell 0 holds the image and the last cell is not exported, so only the
            // cells in between are written. Indexing avoids mutating the element's
            // own child list.
            for (int col = 1; col < items.size() - 1; col++) {
                sb.append(items.get(col).getText()).append(',');
            }
            logger.info(sb.toString());
            sb.append("\n");
            try {
                fw.write(sb.toString());
            } catch (IOException e) {
                logger.error("Cannot write CSV line for product -> " + productId, e);
            }
        }
        return products.size() < PAGE_SIZE;
    }

    /**
     * Extracts the text between the first {@code ('} and the following
     * {@code ')} of an {@code onclick} attribute value.
     *
     * @param onclick attribute value, may be {@code null}
     * @return the enclosed text, or {@code null} when {@code onclick} is
     *         {@code null} or does not contain both delimiters in that order
     */
    private static String extractProductId(String onclick) {
        if (onclick == null) {
            return null;
        }
        int open = onclick.indexOf("('");
        if (open < 0) {
            return null;
        }
        int start = open + 2;
        int end = onclick.indexOf("')", start);
        return (end < 0) ? null : onclick.substring(start, end);
    }

    /**
     * Downloads an image from the site and stores it under
     * {@code output/images/}.
     * <p>
     * Nothing is downloaded when the target file already exists. The last path
     * segment of {@code imgSrc} is URL-encoded (UTF-8, spaces as {@code %20})
     * before the request is made. The response is buffered in memory and only
     * written to disk when it is complete, so a failed or truncated download
     * never leaves a file behind that a later run would mistake for a finished
     * one. Responses with a {@code text/*} content type are rejected; responses
     * without a declared length are accepted and read to end of stream.
     * <p>
     * The method is best-effort: every failure is logged and swallowed so that a
     * single bad image does not abort the caller.
     *
     * @param imgSrc     site-relative path of the image, e.g. starting with
     *                   {@code /}; a {@code null} value is logged and ignored
     * @param toFileName file name (relative to {@code output/images/}) to save as
     */
    public static void saveImage(String imgSrc, String toFileName) {
        String toFile = IMAGE_DIR + toFileName;
        File target = new File(toFile);
        if (target.exists()) {
            logger.info("File already saved ->" + toFile);
            return;
        }
        if (imgSrc == null) {
            logger.error("No image source given for " + toFile);
            return;
        }

        InputStream in = null;
        FileOutputStream out = null;
        boolean saved = false;
        try {
            int endIndex = imgSrc.lastIndexOf("/") + 1;
            String encodeFileName = URLEncoder.encode(imgSrc.substring(endIndex), "UTF-8").replaceAll("[+]", "%20");
            URL u = new URL(IMAGE_HOST + imgSrc.substring(0, endIndex) + encodeFileName);
            URLConnection uc = u.openConnection();
            String contentType = uc.getContentType(); // null when the server sends none
            int contentLength = uc.getContentLength(); // -1 when the server sends none
            if (contentType != null && contentType.startsWith("text/")) {
                logger.error("This is not a binary file. -> " + imgSrc);
                return;
            }

            in = new BufferedInputStream(uc.getInputStream());
            ByteArrayOutputStream buffer = new ByteArrayOutputStream(
                    contentLength > 0 ? contentLength : COPY_BUFFER_SIZE);
            byte[] chunk = new byte[COPY_BUFFER_SIZE];
            int bytesRead;
            while ((bytesRead = in.read(chunk)) != -1) {
                buffer.write(chunk, 0, bytesRead);
            }
            if (contentLength != -1 && buffer.size() != contentLength) {
                logger.error("Only read " + buffer.size() + " bytes; Expected " + contentLength + " bytes");
                return;
            }

            if (!ensureDirectory(target.getParentFile())) {
                return;
            }
            out = new FileOutputStream(target);
            buffer.writeTo(out);
            out.flush();
            saved = true;
            logger.info("Saved file " + u.toString() + " to " + toFile);
        } catch (Exception e) {
            // Deliberately broad: image download is best-effort and must not stop
            // the scrape, whatever goes wrong.
            logger.error("Failed to save image " + imgSrc + " to " + toFile, e);
        } finally {
            close(in, imgSrc);
            close(out, toFile);
            // A file that was opened but not completely written would be skipped
            // as "already saved" on the next run, so remove it.
            if (out != null && !saved && !target.delete()) {
                logger.warn("Cannot remove incomplete file -> " + toFile);
            }
        }
    }

    /**
     * Makes sure a directory exists, creating it and its parents when needed.
     *
     * @param dir directory to check; {@code null} means "current directory" and
     *            is always accepted
     * @return {@code true} when the directory exists afterwards; {@code false}
     *         (after logging) when it could not be created
     */
    private static boolean ensureDirectory(File dir) {
        if (dir == null || dir.isDirectory() || dir.mkdirs()) {
            return true;
        }
        logger.error("Cannot create directory -> " + dir);
        return false;
    }

    /**
     * Closes a resource, logging instead of throwing when closing fails.
     *
     * @param resource    resource to close; {@code null} is ignored
     * @param description name used in the log message when closing fails
     */
    private static void close(Closeable resource, String description) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            logger.warn("Failed to close " + description, e);
        }
    }

}