`
cpszgy
  • 浏览: 22374 次
  • 性别: Icon_minigender_1
  • 来自: 合肥
社区版块
存档分类
最新评论

用HttpClient访问sina微博

 
阅读更多
     今天接到任务要去爬sina微博内容。爬虫抓取页面时需要带上登录后的cookie,所以要用httpclient写个登录接口。
用到的分析工具:Live HTTP Headers(火狐的一个插件)
参考资源:http://blog.csdn.net/yodlove/article/details/5938022
               http://blog.csdn.net/jk_yu520/article/details/6622661
用到单点登陆真是蛋疼。而且还跳来跳去,不得不跟着一步一步来。
修改:
       昨天没看细节,后来发现cookie有问题:登录是成功的,但是使用cookie的时候出问题。只好继续埋头BAIDU,找到一份虽然目前已无法登录、但仍可以借鉴的源代码:
http://www.lupaworld.com/code.php?mod=list&itemid=26&path=weibobackup_1.1/src/cn/jayslong/weibo&file=Login.java
     主要代码修改:增加了
   
 DefaultHttpParams.getDefaultParams().setBooleanParameter(
				   HttpMethodParams.SINGLE_COOKIE_HEADER, true);   

     废除了mergeCookie( )方法。
代码
package com.cp.http;

import java.io.IOException;
import java.net.URLEncoder;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.httpclient.Cookie;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpState;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.commons.httpclient.params.DefaultHttpParams;
import org.apache.commons.httpclient.params.HttpMethodParams;

import com.cp.http.Sh1;

/**
 * Logs in to Sina Weibo through the Sina SSO endpoints with Commons HttpClient
 * 3.x and prints the logged-in user's home page.
 *
 * <p>The sequence driven by {@code main} is:
 * <ol>
 * <li>{@code preLogin} - GET the prelogin URL and read {@code servertime} and
 * {@code nonce} from the response;</li>
 * <li>{@code Login} - POST the user name and the hashed password, then extract
 * the follow-up URL from the returned page;</li>
 * <li>{@code ajaxLogin} - GET that URL and extract the {@code uniqueid};</li>
 * <li>{@code getHomePage} - GET {@code homePageUrl + uniqueid} and print it.</li>
 * </ol>
 *
 * <p>The string offsets used to cut values out of the responses were taken from
 * the responses observed when this was written; if the server changes its
 * output the extraction helpers fail with an {@link IOException} that names
 * the missing marker.
 *
 * @author chengping
 */
public class TestClient {

	static final String LOGON_SITE = "login.sina.com.cn";
	static final int LOGON_PORT = 80;
	static final String preLoginUrl = "http://login.sina.com.cn/sso/prelogin.php?entry=miniblog&callback=sinaSSOController.preloginCallBack&client=ssologin.js(v1.3.14)&_=1313560817097";
	static final String loginurl = "/sso/login.php?client=ssologin.js(v1.3.14)";

	static final String defaultUser = "xxxxx";
	static final String defaultPasswordOld = "xxxxxx";

	static final String homePageUrl = "http://weibo.com/";

	public static void main(String[] args) throws Exception {
		HttpClient client = new HttpClient();
		client.getParams().setParameter(
				HttpMethodParams.HTTP_CONTENT_CHARSET, "UTF8");
		// Let HttpClient put all cookies into a single Cookie header; merging
		// the cookies by hand was tried first and did not work.
		DefaultHttpParams.getDefaultParams().setBooleanParameter(
				HttpMethodParams.SINGLE_COOKIE_HEADER, true);
		client.getParams().setParameter(HttpMethodParams.USER_AGENT,
				"Mozilla/5.0 (X11; U; Linux i686; zh-CN; rv:1.9.1.2) Gecko/20090803 Fedora/3.5.2-2.fc11 Firefox/3.5.2");
		client.getHostConfiguration().setHost(LOGON_SITE, LOGON_PORT);

		String ajaxLoginUrl = Login(client, preLogin(client));
		String uniqueid = ajaxLogin(client, ajaxLoginUrl);
		getHomePage(client, homePageUrl + uniqueid);
	}

	/**
	 * Fetches the given home page and prints its body to standard output.
	 *
	 * @param client      client that already holds the login cookies
	 * @param homePageUrl absolute URL of the page to fetch
	 * @throws IOException if the request fails or the body cannot be read
	 */
	public static void getHomePage(HttpClient client, String homePageUrl)
			throws IOException {
		GetMethod get = new GetMethod(homePageUrl);
		try {
			client.executeMethod(get);
			System.out.println(get.getResponseBodyAsString());
		} finally {
			// Always hand the connection back, even when the request failed.
			get.releaseConnection();
		}
	}

	/**
	 * Runs the prelogin step for the default account.
	 *
	 * @param client client used for the request
	 * @return map containing at least {@code servertime} and {@code nonce}
	 * @throws Exception if the request fails or the response cannot be parsed
	 */
	public static Map<String, String> preLogin(HttpClient client)
			throws Exception {
		return preLogin(client, defaultUser);
	}

	/**
	 * Runs the prelogin step for the given account and returns the fields of
	 * the response.
	 *
	 * @param client    client used for the request
	 * @param userEmail account name, appended (URL-encoded) as the
	 *                  {@code usr} query parameter
	 * @return map containing at least {@code servertime} and {@code nonce}
	 * @throws Exception if the request fails or the response cannot be parsed
	 */
	public static Map<String, String> preLogin(HttpClient client,
			String userEmail) throws Exception {
		GetMethod get = new GetMethod(preLoginUrl + "&usr="
				+ URLEncoder.encode(userEmail, "UTF-8"));
		String response;
		try {
			client.executeMethod(get);
			response = get.getResponseBodyAsString();
		} finally {
			get.releaseConnection();
		}
		// The cookies are intentionally left untouched here: main() enables
		// SINGLE_COOKIE_HEADER, and rewriting the cookie store by hand broke
		// the later requests.
		printCookie(client);
		System.out.println("-------------preLogin结束--------------");
		return responseBodyToMap(response);
	}

	/**
	 * Performs the login POST with the default account and password.
	 *
	 * @param client             client used for the request
	 * @param serverTimeAndNonce result of {@code preLogin}
	 * @return URL to pass to {@code ajaxLogin}
	 * @throws IOException   if the request fails or the response does not
	 *                       contain the follow-up URL
	 * @throws HttpException on an HTTP protocol error
	 */
	public static String Login(HttpClient client,
			Map<String, String> serverTimeAndNonce) throws IOException,
			HttpException {
		return Login(client, serverTimeAndNonce, defaultUser,
				defaultPasswordOld);
	}

	/**
	 * Performs the login POST with the given account and password.
	 *
	 * @param client             client used for the request
	 * @param serverTimeAndNonce result of {@code preLogin}; its
	 *                           {@code servertime} and {@code nonce} entries
	 *                           are sent back and mixed into the password hash
	 * @param userEmail          account name
	 * @param passwordOld        plain-text password
	 * @return URL to pass to {@code ajaxLogin}
	 * @throws IOException   if the request fails or the response does not
	 *                       contain the follow-up URL
	 * @throws HttpException on an HTTP protocol error
	 */
	public static String Login(HttpClient client,
			Map<String, String> serverTimeAndNonce, String userEmail,
			String passwordOld) throws IOException, HttpException {
		PostMethod post = new PostMethod(loginurl);
		// Form fields, in the order the original request sent them.
		post.addParameters(new NameValuePair[] {
				new NameValuePair("entry", "miniblog"),
				new NameValuePair("gateway", "1"),
				new NameValuePair("from", ""),
				new NameValuePair("savestate", "7"),
				new NameValuePair("useticket", "1"),
				new NameValuePair("ssosimplelogin", "1"),
				new NameValuePair("username", userEmail),
				new NameValuePair("service", "miniblog"),
				new NameValuePair("servertime",
						serverTimeAndNonce.get("servertime")),
				new NameValuePair("nonce", serverTimeAndNonce.get("nonce")),
				new NameValuePair("pwencode", "wsse"),
				new NameValuePair("password", encryption(passwordOld,
						serverTimeAndNonce)),
				new NameValuePair("encoding", "utf-8"),
				new NameValuePair(
						"url",
						"http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack"),
				new NameValuePair("returntype", "META") });

		String responseBodyAsString;
		try {
			client.executeMethod(post);
			responseBodyAsString = post.getResponseBodyAsString();
		} finally {
			post.releaseConnection();
		}
		System.out.println("-------------Login结束--------------");
		return getAjaxUrl(responseBodyAsString);
	}

	/**
	 * Builds the password value sent by {@code Login}:
	 * {@code SHA1(SHA1(SHA1(password)) + servertime + nonce)}, hex-encoded by
	 * {@code Sh1.testDigest}.
	 *
	 * @param passwordOld        plain-text password
	 * @param serverTimeAndNonce map holding {@code servertime} and {@code nonce}
	 * @return the hashed password string
	 */
	private static String encryption(String passwordOld,
			Map<String, String> serverTimeAndNonce) {
		String tempPassword = Sh1.testDigest(Sh1.testDigest(passwordOld));
		tempPassword += serverTimeAndNonce.get("servertime");
		tempPassword += serverTimeAndNonce.get("nonce");
		return Sh1.testDigest(tempPassword);
	}

	/**
	 * Requests the URL returned by {@code Login} and extracts the user's
	 * unique id from the response.
	 *
	 * @param client  client used for the request
	 * @param ajaxUrl URL returned by {@code Login}
	 * @return the unique id found in the response
	 * @throws IOException   if the request fails or the response does not
	 *                       contain a unique id
	 * @throws HttpException on an HTTP protocol error
	 */
	public static String ajaxLogin(HttpClient client, String ajaxUrl)
			throws IOException, HttpException {
		GetMethod getByAjax = new GetMethod(ajaxUrl);
		String responseBodyAsString;
		try {
			client.executeMethod(getByAjax);
			System.out.println(getByAjax.getURI());
			responseBodyAsString = getByAjax.getResponseBodyAsString();
		} finally {
			getByAjax.releaseConnection();
		}
		System.out.println("-------------ajaxLogin结束--------------");
		return getUniqueid(responseBodyAsString);
	}

	/**
	 * Cuts the unique id out of the {@code ajaxLogin} response: the text that
	 * starts 11 characters after the first "uniqueid" and ends 3 characters
	 * before the first "userid".
	 *
	 * @param responseBodyAsString body returned by the ajax login request
	 * @return the unique id
	 * @throws IOException if either marker is missing or out of order
	 */
	private static String getUniqueid(String responseBodyAsString)
			throws IOException {
		return extractBetween(responseBodyAsString, "uniqueid", 11, "userid",
				3, "uniqueid in the ajaxLogin response (the login may have been rejected)");
	}

	/**
	 * Parses the body of the prelogin response - the text between the first
	 * '{' and the first '}' - as comma-separated {@code "key":value} pairs.
	 * Quotes are removed from keys and values.
	 *
	 * @param response body returned by the prelogin request
	 * @return all pairs found; guaranteed to contain {@code servertime} and
	 *         {@code nonce}
	 * @throws IOException if the body has no {...} section or lacks one of
	 *                     the two required keys
	 */
	private static Map<String, String> responseBodyToMap(String response)
			throws IOException {
		if (response == null) {
			throw new IOException("preLogin returned no response body");
		}
		int open = response.indexOf('{');
		int close = response.indexOf('}');
		if (open < 0 || close < open) {
			throw new IOException("Unexpected preLogin response: " + response);
		}

		Map<String, String> result = new HashMap<String, String>();
		for (String pair : response.substring(open + 1, close).split(",")) {
			// Limit 2 so that a value containing ':' is kept intact.
			String[] elements = pair.split(":", 2);
			if (elements.length == 2) {
				result.put(elements[0].replace("\"", "").trim(),
						elements[1].replace("\"", "").trim());
			}
		}
		if (!result.containsKey("servertime") || !result.containsKey("nonce")) {
			throw new IOException(
					"preLogin response lacks servertime/nonce: " + response);
		}
		return result;
	}

	/**
	 * Debug helper: prints the number of cookies held by the client followed
	 * by each cookie's name and value.
	 *
	 * @param client client whose cookie store is printed
	 */
	private static void printCookie(HttpClient client) {
		Cookie[] cookies = client.getState().getCookies();
		System.out.println("目前有" + cookies.length + "条cookie");
		int index = 0;
		for (Cookie cookie : cookies) {
			System.out.println("cookie[" + index + "]:{" + cookie.getName()
					+ "," + cookie.getValue() + "}");
			index++;
		}
	}

	/**
	 * Cuts the follow-up URL out of the page returned by {@code Login}: the
	 * text that starts 9 characters after the first "replace" and ends 6
	 * characters before the first "&lt;/script&gt;".
	 *
	 * @param responseBodyAsString body returned by the login POST
	 * @return the URL to request next
	 * @throws IOException if either marker is missing or out of order
	 */
	private static String getAjaxUrl(String responseBodyAsString)
			throws IOException {
		return extractBetween(responseBodyAsString, "replace", 9, "</script>",
				6, "redirect URL in the Login response");
	}

	/**
	 * Returns {@code text.substring(indexOf(startMarker) + startSkip,
	 * indexOf(endMarker) - endBackoff)}, but reports a missing marker or an
	 * empty/inverted range as an {@link IOException} instead of letting
	 * {@code substring} fail with an index error.
	 *
	 * @param text        response body to search; may be {@code null}
	 * @param startMarker text whose first occurrence anchors the start
	 * @param startSkip   characters to skip from the start marker's position
	 * @param endMarker   text whose first occurrence anchors the end
	 * @param endBackoff  characters to step back from the end marker's position
	 * @param what        description of the wanted value for the error message
	 * @return the extracted text
	 * @throws IOException if the value cannot be located
	 */
	private static String extractBetween(String text, String startMarker,
			int startSkip, String endMarker, int endBackoff, String what)
			throws IOException {
		if (text == null) {
			throw new IOException("No response body; cannot find " + what);
		}
		int startMarkerPos = text.indexOf(startMarker);
		int endMarkerPos = text.indexOf(endMarker);
		if (startMarkerPos < 0 || endMarkerPos < 0) {
			throw new IOException("Cannot find " + what + ": " + text);
		}
		int start = startMarkerPos + startSkip;
		int end = endMarkerPos - endBackoff;
		if (start > end) {
			throw new IOException("Cannot find " + what + ": " + text);
		}
		return text.substring(start, end);
	}
}

下面是密码加密的辅助类。sina采用的方式是:先对密码做两次SHA1摘要,把得到的字符串拼接上preLogin返回的两个参数servertime(服务器时间)和nonce(随机生成的字符串),再做一次SHA1摘要。(我靠,sina他不累啊。)
package com.cp.http;

/**
 * SHA-1 helper used to hash the login password.
 */
public class Sh1 {

	/**
	 * Returns the SHA-1 digest of {@code info} as a lower-case hex string.
	 *
	 * <p>The text is always encoded as UTF-8 before hashing, so the result
	 * does not depend on the platform's default charset.
	 *
	 * @param info text to hash; must not be {@code null}
	 * @return 40-character lower-case hex digest
	 * @throws IllegalStateException if the JVM provides no SHA-1 digest or no
	 *                               UTF-8 charset; a placeholder string is no
	 *                               longer returned, because callers would
	 *                               send it on as if it were a real hash
	 */
	public static String testDigest(String info) {
		try {
			java.security.MessageDigest sha1 = java.security.MessageDigest
					.getInstance("SHA-1");
			return byte2hex(sha1.digest(info.getBytes("UTF-8")));
		} catch (java.security.NoSuchAlgorithmException ex) {
			throw new IllegalStateException("SHA-1 digest is not available", ex);
		} catch (java.io.UnsupportedEncodingException ex) {
			throw new IllegalStateException("UTF-8 charset is not available", ex);
		}
	}

	/**
	 * Converts bytes to a lower-case hex string, two digits per byte and no
	 * separators.
	 *
	 * @param b bytes to convert; must not be {@code null}
	 * @return hex representation, empty for an empty array
	 */
	public static String byte2hex(byte[] b) {
		StringBuilder hex = new StringBuilder(b.length * 2);
		for (int n = 0; n < b.length; n++) {
			int value = b[n] & 0xFF;
			if (value < 0x10) {
				// Integer.toHexString drops the leading zero.
				hex.append('0');
			}
			hex.append(Integer.toHexString(value));
		}
		return hex.toString();
	}
}
分享到:
评论
5 楼 cpszgy 2012-04-19  
Sweblish 写道
复制这段代码,感觉有点问题,没有跑通

sina有部分改动。
4 楼 Sweblish 2012-04-15  
复制这段代码,感觉有点问题,没有跑通
3 楼 cpszgy 2012-04-06  
chanllen 写道
为啥会出现这个问题呢?
http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack&retcode=101&reason=%B5%C7%C2%BC%C3%FB%BB%F2%C3%DC%C2%EB%B4%ED%CE%F3
-------------ajaxLogin结束--------------
Exception in thread "main" java.lang.StringIndexOutOfBoundsException: String index out of range: -14

at java.lang.String.substring(String.java:1932)
at com.cp.http.TestClient.getUniqueid(TestClient.java:240)
at com.cp.http.TestClient.ajaxLogin(TestClient.java:225)
at com.cp.http.TestClient.main(TestClient.java:47)

可能是sina又改了唯一码的方式。我这个是去年的。你自己用Live HTTP Headers去一步步跟着看看。
2 楼 chanllen 2012-04-02  
为啥会出现这个问题呢?
http://weibo.com/ajaxlogin.php?framelogin=1&callback=parent.sinaSSOController.feedBackUrlCallBack&retcode=101&reason=%B5%C7%C2%BC%C3%FB%BB%F2%C3%DC%C2%EB%B4%ED%CE%F3
-------------ajaxLogin结束--------------
Exception in thread "main" java.lang.StringIndexOutOfBoundsException: String index out of range: -14

at java.lang.String.substring(String.java:1932)
at com.cp.http.TestClient.getUniqueid(TestClient.java:240)
at com.cp.http.TestClient.ajaxLogin(TestClient.java:225)
at com.cp.http.TestClient.main(TestClient.java:47)
1 楼 endual 2012-03-30  
牛的,
     HttpClient client = new HttpClient();  应该是HttpClient3.X的版本的,
老师说6月份也要爬新浪的数据,现在摸索着弄下。幸好这样的文章,要不自己动手
真不知道要弄到什么时候

相关推荐

Global site tag (gtag.js) - Google Analytics