C# 基于Titanium爬取微信公众号历史文章列表攻略
1. 准备工作
1.1 安装Titanium
Titanium(Titanium.Web.Proxy)是一个使用C#编写的开源HTTP(S)代理库,可用于拦截和分析网络请求,因此在开始之前需要确保您的项目已经引用了Titanium。您可以通过NuGet安装最新版的Titanium.Web.Proxy,并根据其项目主页上的说明进行配置。
1.2 获取微信公众号的cookie
我们需要使用微信公众号的cookie来进行爬取。您可以使用您的微信账号登录公众号并手动保存cookie,也可以使用模拟登录获取cookie。以下为使用模拟登录获取cookie的示例代码:
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
namespace WeChatCrawler
{
    /// <summary>
    /// Performs a form login against the WeChat Official Accounts platform and
    /// keeps the resulting session cookies in a shared <see cref="CookieContainer"/>
    /// so that later requests can reuse them.
    /// </summary>
    public class WeChatLogin
    {
        // Credentials come from the WECHAT_ACCOUNT / WECHAT_PASSWORD environment
        // variables when they are set; otherwise the placeholders below are used.
        private static readonly string wechatAccount =
            Environment.GetEnvironmentVariable("WECHAT_ACCOUNT") ?? "your_wechat_account";
        private static readonly string wechatPassword =
            Environment.GetEnvironmentVariable("WECHAT_PASSWORD") ?? "your_wechat_password";

        // Shared by every request so cookies set during login are sent afterwards.
        private static readonly CookieContainer cookieContainer = new CookieContainer();

        /// <summary>
        /// Posts the account name and password to the login endpoint. Cookies set
        /// by the server are stored in the shared cookie container.
        /// </summary>
        /// <exception cref="WebException">
        /// Thrown when the request fails or the server does not answer with HTTP 200.
        /// </exception>
        public static void Login()
        {
            string loginUrl = "https://mp.weixin.qq.com/cgi-bin/bizlogin?action=startlogin";
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(loginUrl);
            request.Method = "POST";
            request.ContentType = "application/x-www-form-urlencoded";
            request.CookieContainer = cookieContainer;

            // Form values must be percent-encoded; otherwise characters such as
            // '&', '=' or '+' inside the credentials would corrupt the body.
            string postData = string.Format(
                "username={0}&pwd={1}&imgcode=&f=json&userlang=zh_CN&redirect_url=&token=&lang=zh_CN",
                Uri.EscapeDataString(wechatAccount),
                Uri.EscapeDataString(wechatPassword));
            byte[] byteArray = Encoding.UTF8.GetBytes(postData);
            request.ContentLength = byteArray.Length;

            using (Stream loginStream = request.GetRequestStream())
            {
                loginStream.Write(byteArray, 0, byteArray.Length);
            }

            // The cookie container attached to the request collects the response
            // cookies on its own, so the response only needs to be checked and released.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                if (response.StatusCode != HttpStatusCode.OK)
                {
                    throw new WebException("Login request returned HTTP status " + (int)response.StatusCode);
                }
            }

            // NOTE(review): only the HTTP status is verified here; the JSON body of
            // the reply is not inspected - confirm whether it carries an error code.
            Console.WriteLine("微信公众号登录成功!");
        }

        /// <summary>
        /// Returns the cookie container that holds the cookies collected by <see cref="Login"/>.
        /// </summary>
        public static CookieContainer GetCookieContainer()
        {
            return cookieContainer;
        }
    }
}
2. 爬取历史文章列表
2.1 获取公众号的__biz参数
在开始爬取微信公众号的历史文章列表之前,您需要先获取公众号的__biz参数。您可以通过浏览器的开发者工具,或者通过抓包工具获取。以下为使用模拟登录获取__biz参数的示例代码:
/// <summary>
/// Downloads the account home page with the logged-in cookies and extracts the
/// value of the <c>biz</c> query parameter from the menu link inside the
/// "microbar" script template.
/// </summary>
/// <returns>
/// The extracted biz value, or an empty string when the page does not contain
/// the expected markup.
/// </returns>
private static string GetBiz()
{
    string bizUrl = "https://mp.weixin.qq.com/cgi-bin/home?t=home/index&lang=zh_CN&token=&lang=zh_CN";
    HttpWebRequest request = (HttpWebRequest)WebRequest.Create(bizUrl);
    request.Method = "GET";
    request.CookieContainer = WeChatLogin.GetCookieContainer();

    // Dispose the response, stream and reader even if reading throws.
    string responseData;
    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
    using (Stream responseStream = response.GetResponseStream())
    using (StreamReader streamReader = new StreamReader(responseStream, Encoding.UTF8))
    {
        responseData = streamReader.ReadToEnd();
    }

    // Step 1: isolate the HTML-encoded template that contains the menu link.
    string bizPattern = @"<script id=""microbar"" type=""text/html"">([\s\S]*?)</script>";
    Match bizMatch = Regex.Match(responseData, bizPattern);
    string bizData = bizMatch.Success
        ? HttpUtility.HtmlDecode(bizMatch.Groups[1].Value)
        : string.Empty;

    // Step 2: pull the biz query parameter out of the menu-management link.
    string biz = Regex.Match(bizData, @"<a href=""/cgi-bin/menu/menu_manage\?t=menu/index&action=menu_edit&[\s\S]*?&biz=(?<biz>.*?)&[\s\S]*?"">").Groups["biz"].Value;

    if (string.IsNullOrEmpty(biz))
    {
        // Do not report success when nothing was found (e.g. page layout changed
        // or the session is not logged in).
        Console.WriteLine("公众号biz参数获取失败");
        return "";
    }

    Console.WriteLine("公众号biz参数获取成功:{0}", biz);
    return biz;
}
2.2 构建请求URL和POST数据
在获取__biz参数之后,您需要构建带查询参数的请求URL来获取历史文章列表(本文示例使用GET请求,因此参数直接拼接在URL的查询字符串中,而不是放在POST正文里)。以下为构建请求URL的示例代码:
/// <summary>
/// Builds the GET URL used to request one page of the article list.
/// </summary>
/// <param name="biz">Account identifier sent as the <c>fakeid</c> parameter; null is treated as empty.</param>
/// <param name="offset">Index of the first article to return (sent as <c>begin</c>).</param>
/// <param name="count">Number of articles requested for this page.</param>
/// <returns>The full request URL including the query string.</returns>
private static string BuildUrl(string biz, int offset, int count)
{
    const string baseUrl = "https://mp.weixin.qq.com/cgi-bin/appmsg?action=list_ex";

    // biz is Base64 text and may contain '+', '/' and '='; these must be
    // percent-encoded or the server would decode them incorrectly.
    string fakeId = Uri.EscapeDataString(biz ?? string.Empty);

    // Invariant culture keeps the numbers plain ASCII digits on every machine.
    string query = string.Format(
        System.Globalization.CultureInfo.InvariantCulture,
        "type=9&query=&is_only_read=1&is_include_edu=1&is_include_old=1&is_include_ad=1&begin={0}&count={1}&fakeid={2}&token=&lang=zh_CN",
        offset,
        count,
        fakeId);

    return baseUrl + "&" + query;
}
2.3 发送请求并解析返回结果
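最后,您需要发送请求并解析返回结果来获取历史文章列表。以下示例使用Newtonsoft.Json(Json.NET)的JObject解析返回的JSON(需要引用该库并添加using Newtonsoft.Json.Linq;),并假定您已定义包含Title、Url和PubTime属性的Article类。以下为发送请求并解析返回结果的示例代码: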
/// <summary>
/// Downloads the article list of an account page by page (at most 100 entries)
/// and converts every entry of the "app_msg_list" array into an <c>Article</c>.
/// </summary>
/// <param name="biz">Account identifier passed on to <c>BuildUrl</c>.</param>
/// <returns>The collected articles in the order the server returned them.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when the reply does not contain an "app_msg_list" field.
/// </exception>
private static List<Article> GetArticles(string biz)
{
    const int firstPageSize = 10;
    const int maxPageSize = 20;
    const int maxCount = 100;

    List<Article> articles = new List<Article>();
    int offset = 0;
    int count = firstPageSize;

    // count drops to 0 once maxCount entries have been requested, which ends
    // the loop without sending a pointless "count=0" request.
    while (count > 0)
    {
        string getUrl = BuildUrl(biz, offset, count);
        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(getUrl);
        request.Method = "GET";
        request.CookieContainer = WeChatLogin.GetCookieContainer();

        // Release the connection for every page, even when reading fails.
        string responseData;
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream responseStream = response.GetResponseStream())
        using (StreamReader streamReader = new StreamReader(responseStream, Encoding.UTF8))
        {
            responseData = streamReader.ReadToEnd();
        }

        JObject jsonResult = JObject.Parse(responseData);
        JToken articleList = jsonResult["app_msg_list"];
        if (articleList == null)
        {
            // Without the list field the reply cannot be a normal page; surface
            // the raw reply instead of failing later with a null reference.
            throw new InvalidOperationException("获取历史文章列表失败,服务器返回:" + responseData);
        }

        int received = 0;
        foreach (JToken articleItem in articleList)
        {
            Article article = new Article();
            article.Title = articleItem["title"].ToString();
            article.Url = articleItem["link"].ToString();
            article.PubTime = long.Parse(
                articleItem["create_time"].ToString(),
                System.Globalization.CultureInfo.InvariantCulture);
            articles.Add(article);
            received++;
        }

        // An empty page means there is nothing more to fetch.
        if (received == 0)
        {
            break;
        }

        // Advance by the requested page size, then request at most maxPageSize
        // entries without going past maxCount in total.
        offset += count;
        count = Math.Min(maxPageSize, maxCount - offset);
    }

    return articles;
}
3. 示例说明
3.1 示例1:获取公众号为“网易”(biz为“MjM5NTMxMTQ2MA==”)的历史文章列表
/// <summary>
/// Example entry point: logs in, fetches the article list for the hard-coded
/// biz value and prints title, publish time and link of every article, then
/// waits for a line of console input before exiting.
/// </summary>
static void Main(string[] args)
{
    WeChatLogin.Login();

    List<Article> history = GetArticles("MjM5NTMxMTQ2MA==");

    Console.WriteLine("网易公众号历史文章列表:");
    for (int index = 0; index < history.Count; index++)
    {
        Article item = history[index];
        Console.WriteLine("{0}({1}):{2}", item.Title, TimestampToDateTime(item.PubTime), item.Url);
    }

    // Keep the console window open until the user presses Enter.
    Console.ReadLine();
}
3.2 示例2:获取公众号为“人民网”(biz为“MjM5NTY5MTE0MA==”)的历史文章列表
/// <summary>
/// Example entry point: logs in, fetches the article list for the hard-coded
/// biz value and writes one line per article (title, publish time, link) to
/// the console, then blocks until a line is read from standard input.
/// </summary>
static void Main(string[] args)
{
    WeChatLogin.Login();

    string accountBiz = "MjM5NTY5MTE0MA==";
    List<Article> fetched = GetArticles(accountBiz);

    Console.WriteLine("人民网公众号历史文章列表:");
    foreach (Article entry in fetched)
    {
        var publishedAt = TimestampToDateTime(entry.PubTime);
        Console.WriteLine("{0}({1}):{2}", entry.Title, publishedAt, entry.Url);
    }

    // Wait for Enter so the output stays visible.
    Console.ReadLine();
}
本站文章如无特殊说明,均为本站原创,如若转载,请注明出处:c# 基于Titanium爬取微信公众号历史文章列表 - Python技术站