# -*- coding: UTF-8 -*-
"""Load daily community figures from saved HTML reports into MySQL.

For every day in the half-open range [START_DATE, END_DATE) the script parses
the file ``email_shequ<MMDD>.htm`` from the current directory with PyQuery,
extracts a fixed set of table cells and inserts them as one row.

Usage: python <script> START_DATE END_DATE    (both dates as YYYY-MM-DD)

NOTE: this is Python 2 code (``reload(sys)`` / ``sys.setdefaultencoding``).
"""
import re
import sys
from datetime import datetime, timedelta

import pymysql
from pyquery import PyQuery as pq

# Python 2 only: make implicit str<->unicode conversions use UTF-8.
reload(sys)
sys.setdefaultencoding('utf8')

# Zero-based <tr> indices whose second cell (td:eq(1)) is read, in the order
# the values appear in the inserted row.  The labels are the variable names
# the figures were originally bound to; what each row actually contains
# depends on the report layout -- TODO confirm against a sample report.
_METRIC_ROWS = (
    24,  # topic_posts_app
    26,  # question_posts_app
    27,  # share_posts_app
    28,  # vote_posts_app
    29,  # bycar_posts_app
    37,  # posts_entry
    38,  # specific_posts_entry
    39,  # posts_publish
)


def data_ana(item):
    """Return the UTF-8 encoded text of *item* up to the first '('.

    *item* is any object with a ``text()`` method returning a string
    (here: a PyQuery selection).  If the text has no '(' it is returned whole.
    """
    return item.text().encode('utf-8').split('(')[0]


def community_daily(start_date, date, week, month):
    """Parse one day's report and return the values for one DB row.

    Args:
        start_date: datetime of the report day; selects the input file
            ``email_shequ<MMDD>.htm``.
        date: label appended as the last element of the row.
        week: label appended after the extracted figures.
        month: label appended between *week* and *date*.

    Returns:
        A list: [first 5-6 digit number found in row 3 / cell 0,
        the eight cells listed in ``_METRIC_ROWS``, week, month, date].

    Raises:
        IndexError: if row 3 / cell 0 contains no 5-6 digit number.
    """
    filename = 'email_shequ' + start_date.strftime('%m%d') + '.htm'
    doc = pq(filename=filename, encoding='utf-8')

    community_tab = re.findall(r'\d{5,6}', doc('tr:eq(3) td:eq(0)').text())[0]

    # Named ``row`` rather than ``list`` so the builtin is not shadowed.
    row = [community_tab]
    row.extend(data_ana(doc('tr:eq(%d) td:eq(1)' % index))
               for index in _METRIC_ROWS)
    row.extend([week, month, date])

    # Single-argument print(...) produces the same output as the Python 2
    # print statement while remaining valid syntax on Python 3.
    print(row)
    return row


def _parse_ymd(text):
    """Convert a 'YYYY-MM-DD' string into a datetime at midnight."""
    year, month, day = text.split('-')
    return datetime(int(year), int(month), int(day))


def main(argv):
    """Insert one row per day for the date range given on the command line."""
    if len(argv) < 3:
        sys.exit('usage: %s START_DATE END_DATE  (dates as YYYY-MM-DD)'
                 % argv[0])

    day = _parse_ymd(argv[1])
    end_date = _parse_ymd(argv[2])

    community_daily_sql = '''**** '''
    db_params = {'host': 'localhost',
                 'user': '****',
                 'passwd': '****',
                 'db': '****',
                 'charset': 'utf8'}

    conn = pymysql.connect(**db_params)
    try:
        cursor = conn.cursor()
        try:
            # END_DATE itself is excluded from the range.
            while day < end_date:
                # weekday() is 0 for Monday, so this is the Monday of the
                # week containing ``day``; week_end is the following Sunday.
                week_start = day - timedelta(day.weekday())
                week_end = week_start + timedelta(days=6)
                week = (week_start.strftime('%m/%d') + '~'
                        + week_end.strftime('%m/%d'))
                month = day.strftime('%Y/%m')
                date = day.strftime('%Y-%m-%d')

                community_daily_list = community_daily(day, date, week, month)
                cursor.execute(community_daily_sql, community_daily_list)
                day = day + timedelta(days=1)

            # NOTE(review): the original layout was lost (script collapsed to
            # one line); a single commit after the loop is assumed, so a
            # failure part-way through leaves nothing committed -- confirm.
            conn.commit()
        finally:
            # Release the cursor and connection even when parsing or the
            # INSERT raises.
            cursor.close()
    finally:
        conn.close()


if __name__ == '__main__':
    main(sys.argv)
# 本站文章如无特殊说明,均为本站原创,如若转载,请注明出处:python爬虫之PyQuery - Python技术站