A Simple Web Crawler

When writing a data crawler, we usually run into the following problems:

1. Fetching the page data (sometimes a cookie has to be passed along)

2. The IP getting banned, so requests fail

3. How to retry failed requests

To address these problems, I wrote a small crawler example.

1. Requesting the data:

Take a certain official website's data as an example:

Enter the query conditions and run the search, then open Chrome's DevTools and find the request that returns the data. You can see that a cookie has to be passed in the request headers, and that the request parameter is a JSON-formatted string.
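For illustration, the request body is just a JSON string. The field names and values below are purely hypothetical; copy the real ones from the request captured in DevTools:

// Hypothetical payload shape -- take the actual field names from the captured request
String param = "{\"depCode\":\"PEK\",\"arrCode\":\"SFO\",\"flightDate\":\"2016-08-15\"}";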


Next we send the request using the httpClient package in Java:

The code below adds the relevant information to the headers and wraps the request parameters in a StringEntity; simulating the HTTP request this way is enough to fetch the data:

private String catchTicket(String depCode,String arrCode,String flightDate) throws Exception {
    String param = "the JSON-formatted parameters";
    String cookie = "the cookie data";
    
    DefaultHttpClient httpclient = new DefaultHttpClient();
    HttpResponse response = null;
    String responseString = null;
    HttpPost httpost = null;
    IProxy iproxy = null;
    try {
        HttpParams params = httpclient.getParams();
        HttpConnectionParams.setConnectionTimeout(params, 50*1000);
        HttpConnectionParams.setSoTimeout(params, 120*1000);
        
        /* set up the proxy: every request goes out through a proxy drawn from the pool */
        iproxy = HttpProxy.getProxy();
        HttpHost httphost = new HttpHost(iproxy.getIp(), iproxy.getPort());    
        httpclient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, httphost); 
        
        httpost = new HttpPost(POST_URL);
        httpost.addHeader("Accept", "application/json, text/javascript, */*; q=0.01");
        httpost.addHeader("Accept-Language", "zh-CN,zh;q=0.8");
        httpost.addHeader("Connection", "keep-alive");
        httpost.addHeader("Content-Type","application/json; charset=UTF-8");
        httpost.addHeader("Cookie", cookie);
        httpost.addHeader("Host", "www.united.com");
        httpost.addHeader("UserAgent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36");
        httpost.addHeader("X-Requested-With", "XMLHttpRequest");
        //httpost.addHeader("Accept-Encoding", "gzip, deflate");
        
        StringEntity parEntity = new StringEntity(param);
        parEntity.setContentType("application/json; charset=utf-8");
        httpost.setEntity(parEntity);
        
        response = httpclient.execute(httpost); // execute the request
        responseString = EntityUtils.toString(response.getEntity(),"UTF-8");
        if (response.getStatusLine().getStatusCode() != 200 && response.getStatusLine().getStatusCode() != 404) {
            logger.info("response code error({}) throw exception",response.getStatusLine().getStatusCode());
            throw new Exception();
        }
    } catch (Exception e) {
        e.printStackTrace();
        HttpProxy.removeProxy(iproxy);
        throw e;
    } finally {
        // httpost may still be null if getProxy() threw before it was created
        if (httpost != null) {
            httpost.abort();
        }
        httpclient.getConnectionManager().shutdown();
    }

    // sleep for 1 second to throttle the request rate
    TimeUnit.SECONDS.sleep(1);
    return responseString;
}
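A side note: DefaultHttpClient and HttpConnectionParams have been deprecated since HttpClient 4.3. On a newer 4.x version, the same timeout-and-proxy setup would look roughly like this minimal sketch (same values as above, just the non-deprecated API):

import org.apache.http.HttpHost;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;

// 50s connect timeout, 120s read timeout, routed through the pooled proxy
RequestConfig config = RequestConfig.custom()
        .setConnectTimeout(50 * 1000)
        .setSocketTimeout(120 * 1000)
        .setProxy(new HttpHost(iproxy.getIp(), iproxy.getPort()))
        .build();
CloseableHttpClient httpclient = HttpClients.custom()
        .setDefaultRequestConfig(config)
        .build();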

As you can see, the catchTicket method above uses a proxy: every request draws one from HttpProxy, and when a request fails, the proxy is marked as bad and its IP is removed from HttpProxy.

I scrape the proxy IPs from the kuaidaili site; the HttpProxy class looks like this:

/**
 * Proxy pool
 * chuan.zhang
 */
public class HttpProxy {
    
    private static final Logger logger = LoggerFactory.getLogger(HttpProxy.class);
    private final static String POST_URL = "http://www.kuaidaili.com/proxylist/1/";
    private static List<IProxy> iproxys = new ArrayList<IProxy>();
    
    public static void main(String[] args) throws Exception {
        System.out.println(HttpProxy.getProxy());
    }
    
    /**
     * Pick a proxy at random from the pool
     * @return
     * @throws Exception
     */
    public static IProxy getProxy() throws Exception {
        if (iproxys.size() == 0) {
            initProxys();
            logger.info("init proxy over");
        }
        
        Random rand = new Random();
        int num = rand.nextInt(iproxys.size()); // nextInt's bound is exclusive; size()-1 would never pick the last proxy and throws when only one proxy is left
        return iproxys.get(num);
    }
    
    public static void removeProxy(IProxy iproxy) {
        if (iproxy != null) {
            iproxys.remove(iproxy);
            logger.info("send request error remove the iproxy: "+iproxy.getIp()+":"+iproxy.getPort());
        }
    }

    /**
     * Initialize the proxy pool:
     * fetch the latest proxies from http://www.kuaidaili.com/
     */
    private static List<IProxy> initProxys() throws Exception {
        DefaultHttpClient httpclient = new DefaultHttpClient();
        HttpResponse response = null;
        String responseString = null;
        HttpGet httget = null;
        
        try {
            HttpParams params = httpclient.getParams();
            HttpConnectionParams.setConnectionTimeout(params, 50*1000);
            HttpConnectionParams.setSoTimeout(params, 120*1000);
            
            httget = new HttpGet(POST_URL);
            httget.addHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
            httget.addHeader("Accept-Language", "zh-CN,zh;q=0.8");
            httget.addHeader("Connection", "keep-alive");
            httget.addHeader("cookie", "channelid=0; sid=1470121255086558; _gat=1; _ga=GA1.2.2135905250.1469704395; Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1469704395,1469781681,1470121266; Hm_lpvt_7ed65b1cc4b810e9fd37959c9bb51b31=1470121847");
            httget.addHeader("Content-Type","application/json; charset=UTF-8");
            httget.addHeader("Host", "www.kuaidaili.com");
            httget.addHeader("Referer", "http://www.kuaidaili.com/proxylist/2/");
            httget.addHeader("UserAgent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36");
            
            response = httpclient.execute(httget); // execute the request
            responseString = EntityUtils.toString(response.getEntity(),"UTF-8");
            
            Pattern p = Pattern.compile("<td data-title=\"IP\">([\\s\\S]*?)</td>[\\s\\S]*?<td data-title=\"PORT\">([\\s\\S]*?)</td>",
                    Pattern.DOTALL);
            Matcher m = p.matcher(responseString);
            while (m.find()) {
                // create a fresh IProxy per match; reusing a single instance
                // would leave every list entry holding the last IP/port parsed
                IProxy iproxy = new IProxy();
                iproxy.setIp(m.group(1).trim());
                iproxy.setPort(Integer.parseInt(m.group(2).trim()));
                iproxys.add(iproxy);
            }
            
        } catch (Exception e) {
            e.printStackTrace();
            logger.error("init proxy error");
            throw e;
        } finally {
            // httget may still be null if an earlier statement threw
            if (httget != null) {
                httget.abort();
            }
            httpclient.getConnectionManager().shutdown();
        }
        
        return iproxys;
    }
    
}

When the proxy pool runs dry, a fresh batch of proxies is scraped from the proxy site. I only grab one page here (about 10 proxies); you could of course scrape more ahead of time and store them.
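The IProxy class is not shown in the original post; judging from the getIp/getPort accessors used above, it is a plain IP/port holder. A minimal sketch would be:

/**
 * Simple IP/port holder for one proxy entry (sketch; not in the original post).
 */
public class IProxy {
    private String ip;
    private int port;

    public String getIp() { return ip; }
    public void setIp(String ip) { this.ip = ip; }
    public int getPort() { return port; }
    public void setPort(int port) { this.port = port; }

    @Override
    public String toString() {
        return ip + ":" + port; // makes HttpProxy.main()'s println readable
    }
}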

Finally, the retry mechanism for failed requests:

public void execute() {
    while(true) {
        try {
            Map<String, Object> routeMap = this.dao.getRoute();
            logger.info("start catch {}", routeMap);
            String depCode = routeMap.get("dep_code").toString();
            String arrCode = routeMap.get("arr_code").toString();
            String flightDate = routeMap.get("catch_date").toString();
            JSONObject json = null;
            try {
                String result = this.catchTicket(depCode,arrCode,flightDate);
                json = JSONObject.parseObject(result);
            } catch (Exception e) {
                logger.info("catch error result: "+routeMap);
                this.retryCatch(routeMap,5);
                continue;
            }
        
            this.parseDataToDb(json);
        } catch (Exception e) {
            // getRoute()/parseDataToDb() failed; back off for 30 minutes
            e.printStackTrace();
            try {
                TimeUnit.MINUTES.sleep(30);
            } catch (InterruptedException e1) {
                e1.printStackTrace();
            }
        }
        }
    }
    
}

When fetching the data throws an exception, this.retryCatch(routeMap, 5) is called; the first argument is the request data, the second the number of retries.

As shown below, the retryCatch method essentially starts a thread so that the main loop isn't blocked. That thread calls the fetch method again: if a request succeeds, it stops; if it keeps failing, it gives up after the given number of retries and records the failure in the database.

/**
 * Retry the fetch
 * @param routeMap the route being fetched
 * @param count the maximum number of retries
 */
private void retryCatch(final Map<String, Object> routeMap,final int count) {
    new Thread(new Runnable() {
        
        @Override
        public void run() {
            String depCode = routeMap.get("dep_code").toString();
            String arrCode = routeMap.get("arr_code").toString();
            String flightDate = routeMap.get("catch_date").toString();
            JSONObject json = null;
            for (int i=0;i<count;i++) {
                logger.info("retry catch ("+i+") {}", routeMap);
                try {
                    String result = catchTicket(depCode,arrCode,flightDate);
                    json = JSONObject.parseObject(result);
                } catch (Exception e) {
                    logger.info("retry catch ("+i+") error result: "+routeMap);
                    if (i == count-1) {
                        dao.updateRoute(routeMap.get("id").toString(), flightDate);
                    }
                    continue;
                }
                break;
            }
            
            if (json != null) {
                // only parse when one of the attempts actually succeeded
                parseDataToDb(json);
            }
        }
    }).start();
}

With that, a basic data crawler is essentially complete.