之前看了很多获取网页源码的写法,要么有乱码,要么没考虑到 gzip 等压缩。比如有些网页的 HttpWebResponse 对象的 CharacterSet 是 iso-8859-1,这种情况下我们改为根据网页 meta 标签里声明的 charset 来读取。另外还有一个将流转成 byte[] 数组的方法:因为 GetResponseStream() 返回的流不能获取 Length 属性,所以只能循环读取。以下是全部源码: - /// <summary>
- /// 将 Stream 转成 byte[]
- /// </summary>
private static byte[] StreamToBytes(Stream stream)
{
    // Drain the stream from its current position into a growable in-memory
    // buffer, so the total length never has to be known up front.
    byte[] contents;
    using (MemoryStream collected = new MemoryStream())
    {
        stream.CopyTo(collected);
        contents = collected.ToArray();
    }

    // Rewind seekable inputs to the very beginning so they can be read again.
    if (stream.CanSeek)
    {
        stream.Position = 0;
    }

    return contents;
}
- /**
- * 用getBytes(encoding):返回字符串的一个byte数组
- * 当b[0]为 63时,应该是转码错误
- * A、不乱码的汉字字符串:
- * 1、encoding用GB2312时,每byte是负数;
- * 2、encoding用ISO8859_1时,b[i]全是63。
- * B、乱码的汉字字符串:
- * 1、encoding用ISO8859_1时,每byte也是负数;
- * 2、encoding用GB2312时,b[i]大部分是63。
- * C、英文字符串
- * 1、encoding用ISO8859_1和GB2312时,每byte都大于0;
- * <p/>
- * 总结:给定一个字符串,用getBytes("iso8859_1")
- * 1、如果b[i]有63,不用转码; A-2
- * 2、如果b[i]全大于0,那么为英文字符串,不用转码; C-1
- * 3、如果b[i]有小于0的,那么已经乱码,要转码。 B-1
- */
/// <summary>
/// Fetches <paramref name="url"/> with an HTTP GET and returns the response body as text.
/// A gzip or deflate Content-Encoding is decompressed before decoding.
/// </summary>
/// <param name="url">Address of the page to download.</param>
/// <param name="charSet">
/// Name of the encoding used to decode the body. When null or empty the encoding is
/// detected: first from the charset of the response's Content-Type header, then, if the
/// header is missing or only reports iso-8859-1, from a charset declared in a meta tag
/// of the page. UTF-8 is used when nothing else can be determined.
/// </param>
/// <param name="aspnetSessionID">
/// Optional value sent as the "ASP.NET_SessionId" cookie; no cookie is sent when null or empty.
/// </param>
/// <returns>The decoded response body.</returns>
public static string DoGet(string url, string charSet = null, string aspnetSessionID = null)
{
    HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
    if (!String.IsNullOrEmpty(aspnetSessionID))
    {
        // ASP.NET identifies the session by a cookie named exactly "ASP.NET_SessionId".
        req.CookieContainer = new CookieContainer();
        req.CookieContainer.Add(new Uri(url), new Cookie("ASP.NET_SessionId", aspnetSessionID));
    }

    // Authenticate with the credentials of the current security context.
    req.Credentials = CredentialCache.DefaultCredentials;
    req.Method = "GET";
    req.ContentType = "application/x-www-form-urlencoded";

    byte[] buffer;
    string headerCharSet;

    // Dispose the response and its stream even when reading fails part-way.
    using (HttpWebResponse res = (HttpWebResponse)req.GetResponse())
    {
        // Capture header values while the response is still open.
        headerCharSet = (res.CharacterSet ?? String.Empty).Trim();
        string contentEncoding = res.ContentEncoding ?? String.Empty;

        Stream receiveStream = res.GetResponseStream();
        // Ordinal comparisons: culture-sensitive lowering breaks under cultures
        // such as tr-TR, where 'I' does not lower-case to 'i'.
        if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            receiveStream = new GZipStream(receiveStream, CompressionMode.Decompress);
        }
        else if (contentEncoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
        {
            receiveStream = new DeflateStream(receiveStream, CompressionMode.Decompress);
        }

        using (receiveStream)
        {
            buffer = StreamToBytes(receiveStream);
        }
    }

    // An explicitly requested encoding always wins; an unknown name throws
    // ArgumentException from Encoding.GetEncoding.
    if (!String.IsNullOrEmpty(charSet))
    {
        return Encoding.GetEncoding(charSet).GetString(buffer);
    }

    Encoding encode = Encoding.UTF8;
    bool headerIsUninformative =
        headerCharSet.Length == 0 ||
        String.Equals(headerCharSet, "iso-8859-1", StringComparison.OrdinalIgnoreCase);

    if (!headerIsUninformative)
    {
        if (String.Equals(headerCharSet, "gbk", StringComparison.OrdinalIgnoreCase) ||
            String.Equals(headerCharSet, "gb2312", StringComparison.OrdinalIgnoreCase))
        {
            encode = Encoding.GetEncoding("gb2312");
        }
        else
        {
            // Use any other charset the header names, if the runtime knows it.
            encode = DoGetTryGetEncoding(headerCharSet) ?? Encoding.UTF8;
        }
    }

    string result = encode.GetString(buffer);

    if (headerIsUninformative)
    {
        // The header did not reveal the real encoding, so look for a charset
        // declared inside the markup and decode again if it differs.
        Encoding declared = DoGetDetectMetaEncoding(result);
        if (declared != null && !declared.Equals(encode))
        {
            result = declared.GetString(buffer);
        }
    }

    return result;
}

/// <summary>
/// Returns the encoding named by the first meta tag charset declaration found in
/// <paramref name="html"/>, or null when there is none or the name is not recognised.
/// </summary>
private static Encoding DoGetDetectMetaEncoding(string html)
{
    // Matches both <meta charset="x"> and <meta ... content="text/html; charset=x">,
    // capturing only the charset token itself.
    Match charSetMatch = Regex.Match(
        html,
        "<meta[^>]*?charset\\s*=\\s*[\"']?\\s*([\\w.:-]+)",
        RegexOptions.IgnoreCase);

    if (!charSetMatch.Success)
    {
        return null;
    }

    return DoGetTryGetEncoding(charSetMatch.Groups[1].Value);
}

/// <summary>
/// Looks up an encoding by name, returning null instead of throwing when the
/// name is empty or not recognised.
/// </summary>
private static Encoding DoGetTryGetEncoding(string name)
{
    if (String.IsNullOrEmpty(name))
    {
        return null;
    }

    try
    {
        return Encoding.GetEncoding(name.Trim().Trim('"', '\''));
    }
    catch (ArgumentException)
    {
        return null;
    }
}
复制代码 [size=1em]
|