土木狗,大二,混吃等死不知老之將至。
items.py
import scrapy


class QidianItem(scrapy.Item):
    """Scrapy item with two declared fields, ``title`` and ``content``."""

    # Book title.
    title = scrapy.Field()
    # Text payload associated with the title.
    content = scrapy.Field()
import os


class QidianPipeline(object):
    """Append each item's content to a text file named after the item's title."""

    # Directory that receives the ``<title>.txt`` files.
    output_dir = r'/home/administrator/PycharmProjects/untitled/qidian/qidian/en'

    def process_item(self, item, spider):
        """Append ``item['content']`` to ``<output_dir>/<title>.txt``.

        Args:
            item: Mapping-like object providing ``get('title')`` and
                ``get('content')``.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, unchanged, so later pipelines can process it.
        """
        # Titles come from scraped pages: neutralise path separators so a
        # title can never point outside ``output_dir``.
        safe_title = str(item.get('title')).replace('/', '_').replace(os.sep, '_')

        # Build an explicit path instead of calling os.chdir(), which would
        # change the working directory of the whole process.
        os.makedirs(self.output_dir, exist_ok=True)
        target = os.path.join(self.output_dir, safe_title + ".txt")

        # Explicit UTF-8 keeps the output independent of the system locale.
        with open(target, mode='a', encoding='utf-8') as f:
            # Missing content is written as nothing rather than raising.
            f.write(item.get('content') or '')
        return item
# Logging configuration (presumably read by Scrapy as project settings).
# Minimum severity that gets logged.
LOG_LEVEL = 'ERROR'
# File that receives the log output (relative path).
LOG_FILE = 'log.txt'
import scrapy

from ..items import QidianItem


class XiaoshuoSpider(scrapy.Spider):
    """Walk qidian.com listing pages and emit each book's free chapters.

    Flow: listing page -> book info page -> first readable chapter ->
    following chapters via the "next chapter" link, until a chapter's
    extracted text is too short to be a free chapter.
    """

    name = 'xiaoshuo'
    allowed_domains = ['qidian.com']
    start_urls = [
        "https://www.qidian.com/all?orderId=&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=0&page=" + str(page)
        for page in range(100, 200)
    ]

    # A chapter whose extracted text (heading included) is not longer than
    # this is treated as the end of the free chapters.
    min_chapter_length = 250

    def parse(self, response):
        """Yield one request per book link found on a listing page."""
        for href in response.xpath("//h4/a/@href").extract():
            # urljoin resolves protocol-relative and relative links alike.
            yield scrapy.Request(response.urljoin(href), callback=self.parse_info)

    def parse_info(self, response):
        """Read the book title and request the page behind the read button.

        Yields nothing (and logs an error) when the title or the read link
        cannot be found on the page.
        """
        title = response.xpath("//h1/em/text()").extract_first()
        read_href = response.xpath("//a[@id='readBtn']/@href").extract_first()
        if title is None or read_href is None:
            self.logger.error("Skipping %s: title or read link not found", response.url)
            return

        # Kept for backward compatibility only: with several books in flight
        # this holds whichever book was started last, so it is not used below.
        self.title_ = title
        print("開始爬取小說:" + title)

        # The title doubles as the output file name downstream.
        item = QidianItem()
        item['title'] = title
        yield scrapy.Request(
            response.urljoin(read_href),
            meta={'item': item},
            callback=self.parse_content,
        )

    def parse_content(self, response):
        """Emit one item for the current chapter, then follow the next chapter.

        Stops following when the chapter text is not longer than
        ``min_chapter_length`` or when the page has no next-chapter link.
        """
        # Take the title from the request's own meta so that progress output
        # and items always refer to the book this response belongs to.
        title = response.meta['item']['title']

        chapter = response.xpath("//h3[@class='j_chapterName']/text()").extract_first()
        if chapter is None:
            self.logger.error("Skipping %s: chapter heading not found", response.url)
            return
        name = chapter + "\n"
        print("正在爬取小說******《" + title + "》******章節:---------------" + name)

        paragraphs = response.xpath(
            "//div[@class='read-content j_readContent']//p/text()"
        ).extract()
        # The first character of every paragraph is dropped, as before
        # (presumably a leading indent character -- verify against the site).
        content = name + ''.join(paragraph[1:] for paragraph in paragraphs)

        # Filter out paid chapters: they yield too little text.
        if len(content) <= self.min_chapter_length:
            print("免費章節已經完啦!")
            return

        # A fresh item per chapter, so an item already handed to the
        # pipelines is never mutated by a later chapter.
        yield QidianItem(title=title, content=content)

        next_href = response.xpath("//a[@id='j_chapterNext']/@href").extract_first()
        if next_href:
            yield scrapy.Request(
                response.urljoin(next_href),
                meta={'item': QidianItem(title=title)},
                callback=self.parse_content,
            )
本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系我们删除。