C#爬虫爬取京东的商品信息
爬取京东商品信息需要实现以下步骤:
- 获取京东的商品列表页面
- 从列表页面中提取所有商品的链接
- 访问每个商品链接获取商品详情页面
- 从商品详情页面中提取商品信息
- 将商品信息保存到本地或者数据库,可以使用CSV或者Excel格式保存
1. 获取京东的商品列表页面
// Step 1: download the JD product list page with a plain HTTP GET.
string url = "https://list.jd.com/list.html?cat=1713,3267,3394";
var request = (HttpWebRequest)WebRequest.Create(url);
request.Method = "GET";

// The response body is decoded as GBK and kept in html for the parsing step.
string html = string.Empty;
using (var response = (HttpWebResponse)request.GetResponse())
using (var reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("GBK")))
{
    html = reader.ReadToEnd();
}
以上代码使用GET方式获取京东的商品列表页面,返回结果保存在 html 变量中。
2. 从列表页面中提取所有商品的链接
// Step 2: parse the list page held in html and print the detail-page URL of every product on it.
HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
document.LoadHtml(html);

// SelectNodes returns null (not an empty collection) when nothing matches,
// so guard before iterating.
HtmlNodeCollection nodes = document.DocumentNode.SelectNodes("//div[@class='p-name']/a");
if (nodes != null)
{
    foreach (HtmlNode node in nodes)
    {
        // GetAttributeValue avoids a NullReferenceException on anchors without an href.
        string href = node.GetAttributeValue("href", string.Empty).Trim();
        if (href.Length == 0)
        {
            continue;
        }

        // Only protocol-relative links ("//item.jd.com/...") need the scheme added;
        // links that already carry a scheme are used as they are.
        string productUrl = href.StartsWith("//", StringComparison.Ordinal) ? "https:" + href : href;
        Console.WriteLine(productUrl);
    }
}
以上代码使用HtmlAgilityPack解析HTML页面,提取所有商品链接,并将链接打印出来。
3. 访问每个商品链接获取商品详情页面
// Step 3: download one product detail page with a plain HTTP GET.
string url = "https://item.jd.com/100006411924.html";
var request = (HttpWebRequest)WebRequest.Create(url);
request.Method = "GET";

// No encoding is passed here, so StreamReader falls back to its default decoding.
string html = string.Empty;
using (var response = (HttpWebResponse)request.GetResponse())
using (var reader = new StreamReader(response.GetResponseStream()))
{
    html = reader.ReadToEnd();
}
以上代码以一个示例商品链接为例,使用GET方式访问商品详情页面,返回结果保存在 html 变量中。
4. 从商品详情页面中提取商品信息
// Step 4: parse the product detail page held in html and print the product's name and price.
HtmlAgilityPack.HtmlDocument document = new HtmlAgilityPack.HtmlDocument();
document.LoadHtml(html);

// The price span's class name embeds the product id. Take it from the file name part of
// the detail-page address in url ("https://item.jd.com/100006411924.html" -> "100006411924").
string productId = Path.GetFileNameWithoutExtension(new Uri(url).AbsolutePath);

// SelectSingleNode returns null when nothing matches, so the text is read null-safely;
// a missing node simply prints as an empty value.
string title = document.DocumentNode.SelectSingleNode("//div[@class='sku-name']")?.InnerText.Trim();
string price = document.DocumentNode.SelectSingleNode("//span[@class='price J-p-" + productId + "']")?.InnerText.Trim();
Console.WriteLine("商品名称:" + title);
Console.WriteLine("商品价格:" + price);
以上代码使用HtmlAgilityPack解析商品详情页面,提取商品名称和价格,并将结果打印出来。
5. 保存商品信息到本地或者数据库
// Step 5a: append one "title","price" row to result.csv (UTF-8).
// Inside a quoted CSV field a double quote must be written as two double quotes,
// otherwise a title containing '"' would break the row. A null value is written as empty.
string csvTitle = (title ?? string.Empty).Replace("\"", "\"\"");
string csvPrice = (price ?? string.Empty).Replace("\"", "\"\"");
using (StreamWriter writer = new StreamWriter("result.csv", true, Encoding.UTF8))
{
    writer.WriteLine("\"" + csvTitle + "\",\"" + csvPrice + "\"");
}
以上代码使用StreamWriter将商品名称和价格保存到CSV文件中,格式为"商品名称","商品价格"。
// Step 5b: insert the product's title and price into the products table.
// The values travel as SQL parameters, never as part of the SQL text.
string sql = "INSERT INTO products (title, price) VALUES (@title, @price)";

// Both the connection and the command are wrapped in using so the connection is
// closed and released even when Open or ExecuteNonQuery throws.
using (SqlConnection conn = new SqlConnection("Data Source=(local);Initial Catalog=mydb;Integrated Security=True"))
using (SqlCommand cmd = new SqlCommand(sql, conn))
{
    cmd.Parameters.AddWithValue("@title", title);
    // A parameter whose value is null is not sent to the server, so fall back to "0"
    // when no price was found (the same fallback the complete program uses).
    cmd.Parameters.AddWithValue("@price", price ?? "0");
    conn.Open();
    cmd.ExecuteNonQuery();
}
以上代码使用SqlConnection将商品名称和价格保存到SQL Server数据库的products表中。
示例
下面是一个简单的程序,演示如何使用以上代码获取京东商品信息。
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using HtmlAgilityPack;
using System.Data.SqlClient;
namespace JDProductCrawler
{
    /// <summary>
    /// Console crawler: downloads one JD product list page, visits every product linked
    /// from it, prints each product's title and price, and stores them in result.csv and
    /// in the products table of the local SQL Server database.
    /// </summary>
    class Program
    {
        private const string ListUrl = "https://list.jd.com/list.html?cat=1713,3267,3394";
        private const string CsvPath = "result.csv";
        private const string ConnectionString = "Data Source=(local);Initial Catalog=mydb;Integrated Security=True";

        /// <summary>
        /// Entry point: crawls the list page and processes every product link found on it.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // The list page is decoded as GBK, the detail pages with StreamReader's default.
            string listHtml = DownloadHtml(ListUrl, Encoding.GetEncoding("GBK"));
            HtmlAgilityPack.HtmlDocument listDocument = new HtmlAgilityPack.HtmlDocument();
            listDocument.LoadHtml(listHtml);

            // SelectNodes returns null (not an empty collection) when nothing matches.
            HtmlNodeCollection links = listDocument.DocumentNode.SelectNodes("//div[@class='p-name']/a");
            if (links == null)
            {
                Console.Error.WriteLine("未找到任何商品链接。");
                return;
            }

            Uri listUri = new Uri(ListUrl);
            foreach (HtmlNode link in links)
            {
                // Resolving against the list page URL handles protocol-relative
                // ("//item.jd.com/..."), relative and already-absolute hrefs alike.
                string href = link.GetAttributeValue("href", string.Empty).Trim();
                Uri productUri;
                if (href.Length == 0
                    || !Uri.TryCreate(listUri, href, out productUri)
                    || (productUri.Scheme != Uri.UriSchemeHttp && productUri.Scheme != Uri.UriSchemeHttps))
                {
                    continue;
                }

                string detailHtml = DownloadHtml(productUri.AbsoluteUri, null);
                HtmlAgilityPack.HtmlDocument detailDocument = new HtmlAgilityPack.HtmlDocument();
                detailDocument.LoadHtml(detailHtml);

                HtmlNode titleNode = detailDocument.DocumentNode.SelectSingleNode("//div[@class='sku-name']");
                if (titleNode == null)
                {
                    // Without a title there is nothing useful to store for this page.
                    Console.Error.WriteLine("未找到商品名称,已跳过:" + productUri.AbsoluteUri);
                    continue;
                }

                string title = titleNode.InnerHtml.Trim();
                string price = ExtractPrice(detailDocument, productUri);

                Console.WriteLine("商品名称:" + title);
                Console.WriteLine("商品价格:" + price);

                AppendToCsv(title, price);
                SaveToDatabase(title, price);
            }
        }

        /// <summary>
        /// Downloads <paramref name="url"/> with an HTTP GET and returns the response body as text.
        /// </summary>
        /// <param name="url">Absolute address of the page to download.</param>
        /// <param name="encoding">Encoding used to decode the body, or null for StreamReader's default.</param>
        /// <returns>The decoded response body.</returns>
        private static string DownloadHtml(string url, Encoding encoding)
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Method = "GET";

            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (StreamReader reader = encoding == null
                ? new StreamReader(response.GetResponseStream())
                : new StreamReader(response.GetResponseStream(), encoding))
            {
                return reader.ReadToEnd();
            }
        }

        /// <summary>
        /// Reads the price from a product detail page.
        /// </summary>
        /// <param name="detailDocument">The parsed detail page.</param>
        /// <param name="productUri">Address of the detail page; its file name is the product id.</param>
        /// <returns>The trimmed content of the price span, or null when it cannot be located.</returns>
        private static string ExtractPrice(HtmlAgilityPack.HtmlDocument detailDocument, Uri productUri)
        {
            // The price span's class embeds the id of the product being shown
            // ("/100006411924.html" -> "price J-p-100006411924"), so it must be taken from
            // the current URL rather than fixed to one product.
            string productId = Path.GetFileNameWithoutExtension(productUri.AbsolutePath);

            // Only plain digits are accepted so the id can be placed into the XPath literal safely.
            if (string.IsNullOrEmpty(productId) || !productId.All(char.IsDigit))
            {
                return null;
            }

            return detailDocument.DocumentNode
                .SelectSingleNode("//span[@class='price J-p-" + productId + "']")
                ?.InnerHtml.Trim();
        }

        /// <summary>
        /// Appends one "title","price" row to the CSV file (UTF-8).
        /// </summary>
        private static void AppendToCsv(string title, string price)
        {
            using (StreamWriter writer = new StreamWriter(CsvPath, true, Encoding.UTF8))
            {
                writer.WriteLine(CsvField(title) + "," + CsvField(price));
            }
        }

        /// <summary>
        /// Wraps a value in double quotes for CSV, doubling any embedded double quote;
        /// null is written as an empty field.
        /// </summary>
        private static string CsvField(string value) =>
            "\"" + (value ?? string.Empty).Replace("\"", "\"\"") + "\"";

        /// <summary>
        /// Inserts one product row into the products table using a parameterized command.
        /// </summary>
        private static void SaveToDatabase(string title, string price)
        {
            const string sql = "INSERT INTO products (title, price) VALUES (@title, @price)";

            // using guarantees the connection is released even when Open or ExecuteNonQuery throws.
            using (SqlConnection conn = new SqlConnection(ConnectionString))
            using (SqlCommand cmd = new SqlCommand(sql, conn))
            {
                cmd.Parameters.AddWithValue("@title", title);
                // A missing price is stored as "0".
                cmd.Parameters.AddWithValue("@price", price ?? "0");
                conn.Open();
                cmd.ExecuteNonQuery();
            }
        }
    }
}
以上程序会自动获取京东液晶屏幕的商品信息,并保存到CSV文件和SQL Server数据库中。
本站文章如无特殊说明,均为本站原创,如若转载,请注明出处:c#爬虫爬取京东的商品信息 - Python技术站