Python crawler (scraping China Tobacco news as an example)


The State Tobacco Monopoly Administration's website is http://www.tobacco.gov.cn/html/.
The content to crawl is the news published by each provincial bureau.
For most provinces the news-page URLs follow a pattern. Guizhou's, for example, is
http://www.tobacco.gov.cn/html/36/3617/361704_i.html, where i is the page number.
For some provinces, however, the URL does not change as you page through the list. Jiangxi, for instance, stays at http://jx.tobacco.com.cn/nportal/portal/_ns:YVAtMTQ2ZGMzYTk5YzQtMTAwODZ8YzB8ZDB8ZWNob2ljZUlkPTE9MTEwfGVwYWdlTnVtYmVyPTE9NQ__/zwxx/zxdt.psml from the first page to the last, so Selenium is needed to handle the JavaScript-driven pagination.
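
For the provinces whose URLs do follow the numeric pattern, a plain requests + lxml loop is enough. The sketch below uses the Guizhou URL pattern above; the page range and the link XPath are placeholders for illustration and have to be adapted to the actual list markup:

# Minimal sketch for the pattern-based case (Guizhou). The XPath and page range
# are assumptions for illustration, not taken from the working crawler below.
import requests
from lxml import etree

BASE = "http://www.tobacco.gov.cn/html/36/3617/361704_{}.html"

for i in range(1, 6):                      # e.g. pages 1-5
    resp = requests.get(BASE.format(i), timeout=10)
    resp.encoding = "utf-8"
    tree = etree.HTML(resp.text)
    for a in tree.xpath('//ul//li/a'):     # placeholder selector for the news list
        print(a.get("href"), a.text)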
The complete Selenium-based code for the Jiangxi case is as follows:

# -*- coding: utf-8 -*-
from selenium import webdriver
import pymysql
from pymysql.cursors import DictCursor
from lxml import etree
import requests
import random
import time

# PhantomJS executable path (raw string so the backslashes are not treated as escapes)
driver = webdriver.PhantomJS(r'G:\Python Extension Packages\phantomjs-2.1.1-windows\bin\phantomjs.exe')
# driver = webdriver.PhantomJS()

url = ['http://jx.tobacco.com.cn/nportal/portal/zwxx/zxdt.psml']
cls = ["省局信息", "地市信息", "基層信息"]   # the three news tabs on the Jiangxi site
page = [40, 133, 57]                         # number of pages under each tab

db_params = dict(
    host="localhost",
    db="chinatobacco",
    user="root",
    passwd="123456",
    charset="utf8",
    cursorclass=DictCursor,
    use_unicode=True
)
connect = pymysql.connect(**db_params)
cursor = connect.cursor()

USER_AGENTS = [
"Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
"Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
"Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
"Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
"Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
"Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
"Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
"Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
"Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
"Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0",
"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36",
"Mozilla/5.0 (iPhone; CPU iPhone OS 7_1_2 like Mac OS X) App leWebKit/537.51.2 (KHTML, like Gecko) Version/7.0 Mobile/11D257 Safari/9537.53",
"Mozilla/5.0 (Windows; U; Windows NT 5.2) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13",
"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; QIHU 360EE)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1) ; Maxthon/3.0)",
"'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Mozilla/5.0 (Macintosh; U; IntelMac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1Safari/534.50",
"Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0",
]



def parse_and_save(href, count, cls):
    """Fetch one article page, extract title/date/body and insert it into MySQL."""
    try:
        link = href
        headers = {'user-agent': random.choice(USER_AGENTS)}
        text = requests.get(href, headers=headers)
        text.encoding = "utf-8"
        tree = etree.HTML(text.text)
        title = tree.xpath('//div[@class="article-content-t"]/text()')
        if title:
            # the title node can be split into several text parts; keep the first
            title = title[0].split()[0]
        date = tree.xpath('//div[@class="article-content-ban"]/span/text()')
        date = date[0].split(":")[-1]
        ps = tree.xpath('//div[@class="content-text"]/p/text()')
        content = ''.join(ps)
        sql = "insert into jiangxi(title, link, cls, date, content) values (%s, %s, %s, %s, %s)"
        params = (title, link, cls, date, content)
        cursor.execute(sql, params)
        connect.commit()
        count += 1
    except Exception:
        # skip articles whose markup does not match the XPaths above
        pass
    return count


if __name__ == "__main__":

    content_count = 0

    # --- 省局信息 (provincial-bureau news) ---
    driver.get(url[0])
    driver.find_element_by_link_text('省局信息').click()
    time.sleep(2)
    page_num = page[0]
    # each <li> in the news list holds one or more article links
    divs = driver.find_elements_by_xpath('//div[@class="new-ul-li5"]//li')
    for div in divs:
        lis = div.find_elements_by_xpath('p/a')
        for li in lis:
            href = li.get_attribute('href')
            content_count = parse_and_save(href=href, count=content_count, cls=cls[0])
    print(cls[0], "第", 1, "頁")
    for i in range(page_num - 1):
        # pagination is driven by JavaScript, so click the "下一頁" (next page) link
        driver.find_element_by_link_text('下一頁').click()
        time.sleep(2)
        divs = driver.find_elements_by_xpath('//div[@class="new-ul-li5"]//li')
        for div in divs:
            lis = div.find_elements_by_xpath('p/a')
            for li in lis:
                href = li.get_attribute('href')
                content_count = parse_and_save(href=href, count=content_count, cls=cls[0])
        print(cls[0], "第", i + 2, "頁")
    print("爬取%d篇文章" % content_count)


    # --- 地市信息 (prefecture-level news) ---
    driver.get(url[0])
    driver.find_element_by_link_text('地市信息').click()
    time.sleep(2)
    page_num = page[1]
    divs = driver.find_elements_by_xpath('//div[@class="new-ul-li5"]//li')
    for div in divs:
        lis = div.find_elements_by_xpath('p/a')
        for li in lis:
            href = li.get_attribute('href')
            content_count = parse_and_save(href=href, count=content_count, cls=cls[1])
    print(cls[1], "第", 1, "頁")
    for i in range(page_num - 1):
        driver.find_element_by_link_text('下一頁').click()
        time.sleep(2)
        divs = driver.find_elements_by_xpath('//div[@class="new-ul-li5"]//li')
        for div in divs:
            lis = div.find_elements_by_xpath('p/a')
            for li in lis:
                href = li.get_attribute('href')
                content_count = parse_and_save(href=href, count=content_count, cls=cls[1])
        print(cls[1], "第", i + 2, "頁")
    print("爬取%d篇文章" % content_count)

    # --- 基層信息 (grassroots-level news) ---
    driver.get(url[0])
    driver.find_element_by_link_text('基層信息').click()
    time.sleep(2)
    page_num = page[2]
    divs = driver.find_elements_by_xpath('//div[@class="new-ul-li5"]//li')
    for div in divs:
        lis = div.find_elements_by_xpath('p/a')
        for li in lis:
            href = li.get_attribute('href')
            content_count = parse_and_save(href=href, count=content_count, cls=cls[2])
    print(cls[2], "第", 1, "頁")
    for i in range(page_num - 1):
        driver.find_element_by_link_text('下一頁').click()
        time.sleep(2)
        divs = driver.find_elements_by_xpath('//div[@class="new-ul-li5"]//li')
        for div in divs:
            lis = div.find_elements_by_xpath('p/a')
            for li in lis:
                href = li.get_attribute('href')
                content_count = parse_and_save(href=href, count=content_count, cls=cls[2])
        print(cls[2], "第", i + 2, "頁")
    print("爬取%d篇文章" % content_count)
    driver.quit()
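
The three blocks above differ only in the tab name and page count, so they could be folded into one helper. The following is a minimal sketch under that assumption; it reuses the driver, url, cls, page and parse_and_save defined above and is not part of the original script:

# Possible refactor (not in the original script): crawl one tab with a single loop.
def crawl_tab(tab_name, page_num):
    count = 0
    driver.get(url[0])
    driver.find_element_by_link_text(tab_name).click()
    time.sleep(2)
    for page_no in range(page_num):
        if page_no > 0:
            # after the first page, advance via the JavaScript "下一頁" link
            driver.find_element_by_link_text('下一頁').click()
            time.sleep(2)
        for li in driver.find_elements_by_xpath('//div[@class="new-ul-li5"]//li'):
            for a in li.find_elements_by_xpath('p/a'):
                count = parse_and_save(href=a.get_attribute('href'), count=count, cls=tab_name)
        print(tab_name, "第", page_no + 1, "頁")
    return count

# usage: total = sum(crawl_tab(name, n) for name, n in zip(cls, page))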

Problems encountered:
1. Right-click is disabled on the page.
   Workaround: Shift+F10.
2. The XPath copied from the browser does not always match what the crawler actually receives; in the program, use the [@class="..."] form of selector.
3. Use single-step debugging and breakpoints.
4. Add exception handling.
5. pymysql.err.InternalError: (1241, 'Operand should contain 1 column(s)')
   The title in the page source is split into two parts, so the scraped title comes back as a list rather than a string and cannot be inserted into the column.
6. requests.exceptions.ConnectionError: ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host.', None, 10054, None))
   The current workaround is to sleep longer and, when the error occurs, restart from the last page reached; a retry helper such as the sketch after this list might also help, but I have not found a better solution.
7. Do not blindly append .html to pages that do not have it.
8. For pages like http://www.hntobacco.gov.cn/export/sites/mysite/gongzuodongtai/yancaoyaowen/### the pagination is not real pagination: all 2000-plus items are on a single page, and the crawl stops on its own partway through, at a different point each time; I do not know why.
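
For problem 6, one option is to wrap the request in a small retry helper with a back-off instead of restarting the whole crawl by hand. A rough sketch follows; the retry count and delays are arbitrary choices, and the helper is not part of the script above:

# Hypothetical helper: retry a GET a few times before giving up, to ride out
# occasional 10054 connection resets. Not part of the original script.
import time
import random
import requests

def get_with_retry(href, user_agents, retries=3, delay=5):
    for attempt in range(retries):
        try:
            resp = requests.get(href,
                                headers={'user-agent': random.choice(user_agents)},
                                timeout=10)
            resp.encoding = "utf-8"
            return resp
        except requests.exceptions.ConnectionError:
            time.sleep(delay * (attempt + 1))   # back off a little more each time
    return None  # the caller decides what to do when all retries fail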

