import datetime
from urllib import parse
import scrapy
import logging
from tutorial_spider.items import TakeFirstItemLoader, TtxsgotoItem
logger = logging.getLogger(__name__)
class Ttxsgoto01Spider(scrapy.Spider):
    """Crawl ttxsgoto.github.io: walk the paginated post index and scrape
    each article's detail page into a ``TtxsgotoItem``.

    Items flow through the filter and MySQL (SQLAlchemy) pipelines declared
    in ``custom_settings``.
    """

    name = 'ttxsgoto01'
    allowed_domains = ['ttxsgoto.github.io']
    start_urls = ['http://ttxsgoto.github.io/']
    custom_settings = {
        "ITEM_PIPELINES": {
            'tutorial_spider.pipelines.TtxsgotoFilterPipeline': 10,
            'tutorial_spider.pipelines.TtxsgotoBlogMysqlchemyPipeline': 20,
        },
    }

    def parse(self, response):
        """Parse a post-index page: schedule each article's detail request,
        then follow the "next page" link if present.
        """
        for article in response.css('#main .post'):
            article_url = article.css('h1 a::attr(href)').get()
            # Guard against posts without a title link: urljoin(base, None)
            # would raise TypeError.
            if not article_url:
                logger.warning('post without article link on %s', response.url)
                continue
            url = parse.urljoin(response.url, article_url)
            yield scrapy.Request(url, callback=self.parse_article)

        next_url = response.css('#page-nav a[rel="next"][href]').css('::attr(href)').get()
        if next_url:
            yield scrapy.Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

    def parse_article(self, response):
        """Parse an article detail page into a ``TtxsgotoItem``.

        ``TakeFirstItemLoader`` keeps only the first extracted value per
        field; ``create_time`` records when the item was scraped (local,
        naive timestamp).
        """
        item_loader = TakeFirstItemLoader(item=TtxsgotoItem(), selector=response)
        item_loader.add_css('title', '#main header a::text')
        item_loader.add_value('url', response.url)
        item_loader.add_css('publish', '.article-time time::text')
        item_loader.add_css('content', '.article-content')
        # NOTE(review): 'classify' reads tags and 'lable' reads categories —
        # looks possibly swapped; field names are defined in TtxsgotoItem,
        # so confirm intent before renaming.
        item_loader.add_css('classify', '.article-tags a::text')
        item_loader.add_css('lable', '.article-categories a::text')
        item_loader.add_value('create_time', datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
        yield item_loader.load_item()