Series Directory
Chapter 1: implementing a crawler with selenium (an image-scraping example). Chapter 2: implementing an incremental crawler with selenium. Chapter 3: a keyword-search crawler with selenium (this article).
Preface
After finishing the 4K beauty images last time, I noticed the anime section also has plenty of good-looking wallpapers. Crawling the whole anime section directly, however, would pull in a lot of unwanted images, so this time the crawler works by searching for a keyword.
1. Source Code
A note up front: since recent Chrome versions stopped cooperating with my selenium setup, I have switched the browser to Firefox.
import requests
from selenium import webdriver
import os
import pymysql


def hide():
    options = webdriver.FirefoxOptions()
    options.add_argument('-headless')
    driver = webdriver.Firefox(options=options)
    return driver


def Gethtml(url):
    driver = hide()
    driver.get(url)
    s = driver.find_elements_by_css_selector("div[class='slist'] li a")
    if str(s[-1].get_attribute("href")).split("/")[-2] == "4kmeinv":
        geturl(s[:-1])
    else:
        geturl(s)
    print(s[-1].get_attribute("href"))
    Gethtml(s[-1].get_attribute("href"))


def huoqvpicture(url):
    driver = hide()  # the original line webdriver.Chrome(options=hide()) passed a driver where options belong; hide() already returns a Firefox driver
    driver.get(url)
    s = driver.find_element_by_css_selector("div[class='photo-pic'] a img")
    print(s.get_attribute("title"))
    insert(url, s.get_attribute("src"), s.get_attribute("title"))
    GetPicture(str(s.get_attribute("src")), str(s.get_attribute("title")))


def GetPicture(url, name):
    root = "../dist/"
    path = root + name.replace(" ", "") + ".jpg"
    try:
        if not os.path.exists(root):
            os.mkdir(root)
        if not os.path.exists(path):
            r = requests.get(url)
            with open(path, 'wb') as f:
                f.write(r.content)
            print("File saved")
        else:
            print("File already exists")
    except:
        print("Download failed")


def geturl(s):
    for i in s:
        print(i.get_attribute("href"))
        if not qvchong(i.get_attribute("href")):
            huoqvpicture(str(i.get_attribute("href")))


def insert(html, jpg, name):
    con = pymysql.connect(host="121.196.244.215", user="root", password="123456", port=3306, db="tupian", charset="utf8")
    cur = con.cursor()
    html = str(html)
    jpg = str(jpg)
    name = str(name)
    sql = "insert into suoyin(html,jpg,name) values('" + html + "','" + jpg + "','" + name + "');"
    print(sql)
    # cur.execute(sql)  # left disabled in this first version; the revision below turns it on
    con.commit()


def qvchong(i):
    con = pymysql.connect(host="121.196.244.215", user="root", password="123456", port=3306, db="tupian", charset="utf8")
    cur = con.cursor()
    sql = "select html from suoyin"
    cur.execute(sql)
    results = cur.fetchall()
    i = (str(i),)
    if i in results:
        print("Record already exists")
        return True
    else:
        return False


def main():
    url = "https://pic.netbian.com/4kmeinv/index.html"
    Gethtml(url)


main()
2. After Modification
import requests
from selenium import webdriver
import os
import pymysql
import time


def hide():
    options = webdriver.FirefoxOptions()
    options.add_argument('-headless')
    driver = webdriver.Firefox(options=options)
    return driver


def huoqvpicture(url):
    driver = hide()
    driver.get(url)
    s = driver.find_element_by_css_selector("div[class='photo-pic'] a img")
    print(s.get_attribute("title"))
    insert(url, s.get_attribute("src"), s.get_attribute("title"))
    GetPicture(str(s.get_attribute("src")), str(s.get_attribute("title")))
    driver.close()


def GetPicture(url, name):
    root = "../dist/"
    path = root + name.replace(" ", "") + ".jpg"
    try:
        if not os.path.exists(root):
            os.mkdir(root)
        if not os.path.exists(path):
            r = requests.get(url)
            with open(path, 'wb') as f:
                f.write(r.content)
            print("File saved")
        else:
            print("File already exists")
    except:
        print("Download failed")


def geturl(s):
    for i in s:
        print(i.get_attribute("href"))
        if not qvchong(i.get_attribute("href")):
            huoqvpicture(str(i.get_attribute("href")))


def insert(html, jpg, name):
    con = pymysql.connect(host="121.196.244.215", user="root", password="123456", port=3306, db="tupian", charset="utf8")
    cur = con.cursor()
    html = str(html)
    jpg = str(jpg)
    name = qvdian(str(name))
    sql = "insert into suoyin(html,jpg,name) values('" + html + "','" + jpg + "','" + name + "');"
    print("Inserted one row")
    cur.execute(sql)
    con.commit()


def qvchong(i):
    con = pymysql.connect(host="121.196.244.215", user="root", password="123456", port=3306, db="tupian", charset="utf8")
    cur = con.cursor()
    sql = "select html from suoyin"
    cur.execute(sql)
    results = cur.fetchall()
    i = (str(i),)
    if i in results:
        print("Record already exists")
        return True
    else:
        return False


def geisuo(driver, a):
    s = driver.find_elements_by_css_selector("div[class='slist'] li a")
    print(a)
    a = a + 1
    if str(s[-1].get_attribute("href")).split("/")[-2] == "result":
        geturl(s[:-1])
    else:
        geturl(s)
    bt = driver.find_element_by_class_name("nextpage")
    bt.click()
    geisuo(driver, a)


def click(url):
    driver = hide()
    driver.implicitly_wait(3)
    driver.get(url)
    keyboard = driver.find_element_by_name("keyboard")
    time.sleep(1)
    keyboard.send_keys("美女")
    bt = driver.find_element_by_name("submit")
    time.sleep(1)
    bt.click()
    geisuo(driver, 1)


def qvdian(s):
    s = str(s)
    ls = s.split("'")
    s = "".join(ls)
    return s


def main():
    url = "https://pic.netbian.com/e/search/result/index.php?page=1&searchid=16"
    click(url)


main()
3. Code Analysis
def hide():
    options = webdriver.FirefoxOptions()
    options.add_argument('-headless')
    driver = webdriver.Firefox(options=options)
    return driver
After switching browsers I also simplified the headless setup: hide() now returns a ready-made browser directly, which tightens up the code.
def click(url):
    driver = hide()            # create a headless browser
    driver.implicitly_wait(3)  # set the browser's implicit wait time
    driver.get(url)            # load the html page
    keyboard = driver.find_element_by_name("keyboard")  # locate the search box
    time.sleep(1)              # wait one second (mimic a human)
    keyboard.send_keys("美女")  # type the keyword into the search box
    bt = driver.find_element_by_name("submit")          # locate the search button
    time.sleep(1)              # wait one second (mimic a human)
    bt.click()                 # click the search button
    geisuo(driver, 1)          # hand the browser and page number onward
This function is arguably the core of all the new code: it is responsible for the keyword search.
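One caveat: the find_element_by_name / find_elements_by_css_selector helpers used throughout this article were removed in Selenium 4. If you are on a current release, the same search step can be written with By locators and an explicit wait. A minimal sketch under that assumption (click_v4 is a hypothetical name; the element names keyboard and submit are the ones used above):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def click_v4(url, keyword="美女"):
    options = webdriver.FirefoxOptions()
    options.add_argument('-headless')
    driver = webdriver.Firefox(options=options)
    driver.get(url)
    # wait until the search box exists instead of sleeping a fixed second
    box = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.NAME, "keyboard")))
    box.send_keys(keyword)
    driver.find_element(By.NAME, "submit").click()
    return driver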
def geisuo(driver, a):
    s = driver.find_elements_by_css_selector("div[class='slist'] li a")
    # collect the relevant urls (the last one is the next-page link)
    print(a)   # print the current page number so you can follow the progress
    a = a + 1  # advance the page counter
    if str(s[-1].get_attribute("href")).split("/")[-2] == "result":
        # check whether the last page has been reached
        geturl(s[:-1])  # crawl all but the last link (not on the last page yet)
    else:
        geturl(s)       # crawl everything (last page reached)
    bt = driver.find_element_by_class_name("nextpage")
    # locate the next-page button
    bt.click()          # click to turn the page
    geisuo(driver, a)   # call itself to process the new page
Previously, turning the page meant fetching a new url each time; this version turns pages with a simulated mouse click.
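Note that the recursion above only stops when find_element_by_class_name throws on the last page, and Python caps recursion depth at roughly 1000 frames, so an iterative version of the same paging logic may be safer for long result lists. A sketch assuming the same page structure (geisuo_loop is a hypothetical name; geturl is the function defined earlier):

from selenium.common.exceptions import NoSuchElementException


def geisuo_loop(driver):
    page = 1
    while True:
        s = driver.find_elements_by_css_selector("div[class='slist'] li a")
        print(page)  # show progress, as before
        if str(s[-1].get_attribute("href")).split("/")[-2] == "result":
            geturl(s[:-1])  # not the last page: skip the next-page link
        else:
            geturl(s)       # last page: crawl everything
        try:
            driver.find_element_by_class_name("nextpage").click()
        except NoSuchElementException:
            break           # no next-page button: we are done
        page += 1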
def huoqvpicture(url):
    driver = hide()
    driver.get(url)
    s = driver.find_element_by_css_selector("div[class='photo-pic'] a img")
    print(s.get_attribute("title"))
    insert(url, s.get_attribute("src"), s.get_attribute("title"))
    GetPicture(str(s.get_attribute("src")), str(s.get_attribute("title")))
    driver.close()  # close the browser to save memory
My advice is to turn headless mode on only after all the code is written. Earlier I was opening a new browser for every single image and never closing it, and with headless mode on I did not even notice; that eats a lot of memory.
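One related point: the driver.close() at the end never runs if anything above it raises, so a browser can still leak on errors. A try/finally sketch that guards against that (huoqvpicture_safe is a hypothetical name; quit() also shuts down the geckodriver process, which close() does not):

def huoqvpicture_safe(url):
    driver = hide()
    try:
        driver.get(url)
        s = driver.find_element_by_css_selector("div[class='photo-pic'] a img")
        print(s.get_attribute("title"))
        insert(url, s.get_attribute("src"), s.get_attribute("title"))
        GetPicture(str(s.get_attribute("src")), str(s.get_attribute("title")))
    finally:
        driver.quit()  # runs even if locating or downloading fails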
def qvdian(s):
    s = str(s)         # make sure the data is a string
    ls = s.split("'")  # split on each single quote
    s = "".join(ls)    # join the pieces straight back together
    return s
When inserting into the database, a ' inside the data makes the insert fail, so this small helper strips the single quote; one bug fixed.
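Stripping the quote works, but the root problem is building SQL by string concatenation. pymysql can escape values itself if you pass them as parameters, which makes qvdian unnecessary and closes the injection hole as well. A sketch of insert() rewritten that way (insert_param is a hypothetical name; same table and columns as above):

def insert_param(html, jpg, name):
    con = pymysql.connect(host="121.196.244.215", user="root", password="123456",
                          port=3306, db="tupian", charset="utf8")
    try:
        cur = con.cursor()
        # %s placeholders: pymysql quotes and escapes each value itself
        sql = "insert into suoyin(html, jpg, name) values (%s, %s, %s)"
        cur.execute(sql, (str(html), str(jpg), str(name)))
        con.commit()
    finally:
        con.close()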
Summary
That covers most of what there is to know about selenium. I am now moving on to the scrapy crawler framework; once that is done I will have worked through essentially all of Python's crawling stack. This wraps up the selenium series.