Crawling Qidian Novels with Python Scrapy


  Civil engineering student, sophomore, drifting through the days with no thought of growing old.



items.py

import scrapy


class QidianItem(scrapy.Item):
    # Book title; it doubles as the output file name
    title = scrapy.Field()
    # Chapter title plus chapter body text
    content = scrapy.Field()
pipelines.py
import os

# Output directory from the original project; adjust to your own machine.
BOOKS_DIR = r'/home/administrator/PycharmProjects/untitled/qidian/qidian/en'


class QidianPipeline(object):

    def process_item(self, item, spider):
        # One file per book, named after the title; item.get('title') returns the book title.
        path = os.path.join(BOOKS_DIR, str(item.get('title')) + '.txt')
        # Append each chapter as it arrives; utf-8 so Chinese text is written safely.
        with open(path, mode='a', encoding='utf-8') as f:
            f.write(item.get('content'))
        return item
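
The directory in BOOKS_DIR has to exist before the first chapter arrives, or process_item raises FileNotFoundError. A minimal sketch of a fix, assuming the same BOOKS_DIR constant: add an open_spider hook to QidianPipeline, which Scrapy calls once at startup.

    def open_spider(self, spider):
        # Called once when the spider starts; make sure the output directory exists.
        os.makedirs(BOOKS_DIR, exist_ok=True)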

settings.py
# Log only errors, and send them to log.txt instead of the console.
LOG_LEVEL = 'ERROR'
LOG_FILE = 'log.txt'
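
For the pipeline to run at all, it also has to be registered in settings.py, a step the original post leaves out. A minimal sketch, assuming the default Scrapy project layout with a package named qidian (as the path in pipelines.py suggests):

ITEM_PIPELINES = {
    'qidian.pipelines.QidianPipeline': 300,
}
# Projects generated by `scrapy startproject` also default to
# ROBOTSTXT_OBEY = True, which may block this crawl; if so, disable it:
ROBOTSTXT_OBEY = False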

The spider file
import scrapy
from ..items import QidianItem


class XiaoshuoSpider(scrapy.Spider):
    name = 'xiaoshuo'
    allowed_domains = ['qidian.com']
    # Pages 100-199 of Qidian's "all books" listing.
    start_urls = [
        "https://www.qidian.com/all?orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0&page=" + str(page)
        for page in range(100, 200)
    ]

    def parse(self, response):
        # Collect each book's URL and hand the links one by one to parse_info.
        for url_ in response.xpath("//h4/a/@href").extract():
            yield scrapy.Request('https:' + str(url_), callback=self.parse_info)

    def parse_info(self, response):
        item = QidianItem()
        # Grab the novel's title; it serves as the file name.
        title = response.xpath("//h1/em/text()").extract()[0]
        print("Starting to crawl novel: " + title)
        item['title'] = title
        # Grab the "read for free" link and hand it to parse_content.
        info_url = response.xpath("//a[@id='readBtn']/@href").extract()[0]
        yield scrapy.Request("https:" + str(info_url), meta={'item': item}, callback=self.parse_content)

    def parse_content(self, response):
        # Fetch one chapter, then recurse via the "next chapter" link.
        # The same item object is passed along and reused for every chapter.
        item = response.meta['item']
        # Chapter title.
        name = response.xpath("//h3[@class='j_chapterName']/text()").extract()[0] + "\n"
        print("Crawling novel ******《" + item['title'] + "》****** chapter: ---------------" + name)
        content = name
        for str_ in response.xpath("//div[@class='read-content j_readContent']//p/text()").extract():
            # Drop the leading indent character of each paragraph.
            content = content + str_[1:]
        # Store the chapter title and body in the content field.
        item['content'] = content
        next_url = 'https:' + response.xpath("//a[@id='j_chapterNext']/@href").extract()[0]
        # Paid chapters only expose a short preview, so a length check filters them out.
        if len(content) > 250:
            yield item
            yield scrapy.Request(str(next_url), meta={'item': item}, callback=self.parse_content)
        else:
            print("The free chapters are finished!")
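
With all of the files in place, the crawl is started from the project root with Scrapy's standard command line, using the name attribute defined on the spider:

scrapy crawl xiaoshuo

Per the settings above, anything at ERROR level ends up in log.txt, and one .txt file per book accumulates in the pipeline's output directory.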





