Goal: build a full-site crawler for the Jianshu website, save the data to a MySQL database, and integrate Selenium + chromedriver into Scrapy.
The crawl results are as follows:
Installing Selenium and chromedriver:
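Selenium itself installs with pip; chromedriver is a separate binary that must match the locally installed Chrome version (it can be downloaded from https://chromedriver.chromium.org/downloads and placed on the PATH). Assuming the virtual environment is active:
pip install selenium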
Project setup
Start a Scrapy project from a spider template; anyone with Scrapy experience will be familiar with this step. Enter the virtual environment you created and run the following shell commands.
scrapy startproject [projectname]
cd [projectname]
scrapy genspider -t crawl [spidername] [spiderdomain]
Once that is done, open the project in your IDE (PyCharm) and start working on it.
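With the names used in the rest of this article (project jianshu, spider js_spider, domain jianshu.com), the commands would be:
scrapy startproject jianshu
cd jianshu
scrapy genspider -t crawl js_spider "jianshu.com"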
Analyzing the Jianshu pages
The goal is to crawl every article on Jianshu, so we start from the Jianshu home page URL. Inspecting the requests with the browser dev tools (F12) shows that an article URL looks like https://www.jianshu.com/p/d97946dfdcef, and the links shown on the home page follow the same pattern. A regular expression can therefore match every article URL on the site, and setting follow to True lets the crawler keep following in-site links.
rules = (
    Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True),
)
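A quick standalone check of that pattern (not part of the project, just to confirm what it matches):
import re

pattern = re.compile(r'.*/p/[0-9a-z]{12}.*')
print(bool(pattern.match('https://www.jianshu.com/p/d97946dfdcef')))  # True: article page
print(bool(pattern.match('https://www.jianshu.com/u/abcdef123456')))  # False: no /p/ segment, so not an article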
Extracting the required data
The sample code that used to circulate online extracted the fields directly with XPath. After adapting it to the new page layout, it no longer worked as expected, even though the XPath syntax was valid and the extracted link really did open the image we wanted. Searching the page source for the relevant keywords reveals the cause: the article data is not present in the static HTML. It is embedded as JSON inside the <script id="__NEXT_DATA__"> tag (data that would otherwise be fetched via Ajax), so the spider parses that JSON instead.
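This is easy to verify in scrapy shell before writing the spider; a minimal sketch (the key path props -> initialState -> note -> data is the one the spider below walks):
# inside: scrapy shell "https://www.jianshu.com/p/d97946dfdcef"
import json
raw = response.xpath('//script[@id="__NEXT_DATA__"]/text()').get()
data = json.loads(raw)
print(data['props']['initialState']['note']['data'].keys())
# expect keys such as public_title, free_content, wordage, likes_count, views_count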
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from jianshu.items import JianshuItem
import json


class JsSpiderSpider(CrawlSpider):
    name = 'js_spider'
    allowed_domains = ['jianshu.com']
    start_urls = ['https://www.jianshu.com/']

    rules = (
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True),
    )

    def parse_detail(self, response):
        script = response.xpath('//script[@id="__NEXT_DATA__"]/text()').get()
        process_url = response.url.split('?')[0]   # strip the query string
        article_id = process_url.split('/')[-1]    # the last path segment is the article id
        origin_url = response.url
        data = json.loads(script)                  # parse the embedded JSON into a dict
        # walk down props -> initialState -> note -> data
        props = data.get('props')
        if props:
            initialState = props.get('initialState')
            if initialState:
                note = initialState.get('note')
                if note:
                    data_ = note.get('data')
                    if data_:
                        title = data_.get('public_title')
                        content = data_.get('free_content')
                        author = data_.get('user').get('nickname')  # to be safe, guard this with an if-check
                        letterNumber = data_.get('wordage')
                        likes_count = data_.get('likes_count')
                        views_count = data_.get('views_count')
                        # first_shared_at = data_.get('first_shared_at')  # publication time
                        publish_time = data_.get('last_updated_at')       # last-updated time
                        item = JianshuItem(title=title, content=content, author=author,
                                           letterNumber=letterNumber, article_id=article_id,
                                           publish_time=publish_time, likes_count=likes_count,
                                           views_count=views_count, origin_url=origin_url)
                        yield item
Define the fields in the Item
import scrapy


class JianshuItem(scrapy.Item):
    author = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
    # avatar = scrapy.Field()
    publish_time = scrapy.Field()
    letterNumber = scrapy.Field()
    article_id = scrapy.Field()
    likes_count = scrapy.Field()
    views_count = scrapy.Field()
    origin_url = scrapy.Field()
Database design
Note that the id column must be set to auto-increment.
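The original post shows the table design as a screenshot. A one-off helper that creates an equivalent table is sketched below; the column types are assumptions chosen to fit the fields the pipelines insert.
import pymysql

conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       password='root', database='jianshu', charset='utf8')
# column names match the INSERT statements in the pipelines; the types are assumed
ddl = """
CREATE TABLE IF NOT EXISTS article (
    id INT PRIMARY KEY AUTO_INCREMENT,
    title VARCHAR(255),
    content LONGTEXT,
    author VARCHAR(64),
    letterNumber INT,
    article_id VARCHAR(32),
    publish_time VARCHAR(64),
    likes_count INT,
    views_count INT,
    origin_url VARCHAR(255)
)
"""
with conn.cursor() as cursor:
    cursor.execute(ddl)
conn.commit()
conn.close()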
Defining the download middleware
import time

from scrapy.http import HtmlResponse
from selenium import webdriver


class SeleniumDownloadMiddleware(object):
    def __init__(self):
        self.driver = webdriver.Chrome()

    def process_request(self, request, spider):
        self.driver.get(request.url)
        time.sleep(2)
        try:
            # keep clicking "show more" until the button disappears; once it is gone,
            # find_element raises NoSuchElementException and we fall through to except
            while True:
                showMore = self.driver.find_element_by_class_name('show-more')  # Selenium 4: find_element(By.CLASS_NAME, 'show-more')
                showMore.click()
                time.sleep(0.5)
        except Exception:
            pass
        source = self.driver.page_source
        # hand the rendered page back to Scrapy so it is not downloaded again
        response = HtmlResponse(url=self.driver.current_url, body=source,
                                request=request, encoding='utf-8')
        return response
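The original never quits the browser when the crawl ends. One way to do that, assuming you are happy to hook Scrapy's spider_closed signal (a sketch, not from the original post), is to extend the middleware like this:
from scrapy import signals

class SeleniumDownloadMiddleware(object):
    # ... __init__ and process_request as above ...

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy uses from_crawler (when defined) to instantiate the middleware
        middleware = cls()
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        # close the browser once the spider has finished
        self.driver.quit()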
The pipelines.py file
This is where the items are written to the database. Two versions are shown: a synchronous one and an asynchronous one.
import pymysql
from twisted.enterprise import adbapi   # asynchronous database connection pool
from pymysql import cursors             # database cursor classes


class JianshuPipeline(object):
    def __init__(self):
        dbparams = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': 'root',
            'database': 'jianshu',
            'charset': 'utf8'
        }
        self.conn = pymysql.connect(**dbparams)
        self.cursor = self.conn.cursor()
        self._sql = None

    def process_item(self, item, spider):
        self.cursor.execute(self.sql, (item['title'], item['content'],
                                       item['author'], item['letterNumber'],
                                       item['article_id'],
                                       item['publish_time'],
                                       item['likes_count'], item['views_count'],
                                       item['origin_url']))
        self.conn.commit()
        return item

    @property
    def sql(self):
        if not self._sql:
            self._sql = """
                insert into article(id,title,content,author,letterNumber,
                article_id,publish_time,likes_count,views_count,origin_url)
                values (null,%s,%s,%s,%s,%s,%s,%s,%s,%s)
            """
        return self._sql
class JianshuTwistedPipeline(object):
    def __init__(self):
        params = {
            'host': '127.0.0.1',
            'port': 3306,
            'user': 'root',
            'password': 'root',
            'database': 'jianshu',
            'charset': 'utf8',
            'cursorclass': cursors.DictCursor
        }
        # asynchronous connection pool; inserts run in Twisted's thread pool
        self.dbpool = adbapi.ConnectionPool("pymysql", **params)
        self._sql = None

    @property
    def sql(self):
        if not self._sql:
            self._sql = '''insert into article(id,title,content,author,letterNumber,
                article_id,publish_time,likes_count,views_count,origin_url)
                values (null,%s,%s,%s,%s,%s,%s,%s,%s,%s)'''
        return self._sql

    def process_item(self, item, spider):
        # insert the data asynchronously
        defer = self.dbpool.runInteraction(self.insert_item, item)
        # error handling
        defer.addErrback(self.handle_error, item, spider)
        return item

    def insert_item(self, cursor, item):
        # runInteraction passes the cursor first, then the extra arguments
        cursor.execute(self.sql, (item['title'], item['content'],
                                  item['author'], item['letterNumber'],
                                  item['article_id'],
                                  item['publish_time'],
                                  item['likes_count'], item['views_count'],
                                  item['origin_url']))

    def handle_error(self, error, item, spider):
        print('+' * 30 + 'error' + '+' * 30)
        print(error)
        print('+' * 30 + 'error' + '+' * 30)
settings.py
For the middleware defined above to take effect, it must be enabled in settings.py. In addition, set a User-Agent, disable the robots.txt protocol, and set a reasonable download delay, otherwise the server will block the crawler.
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 3
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36'
}
DOWNLOADER_MIDDLEWARES = {
'jianshu.middlewares.SeleniumDownloadMiddleware': 543,
}
ITEM_PIPELINES = {
'jianshu.pipelines.JianshuPipeline': 300,
}
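To use the asynchronous pipeline instead, point ITEM_PIPELINES at the Twisted version defined above:
ITEM_PIPELINES = {
    'jianshu.pipelines.JianshuTwistedPipeline': 300,
}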