
Python Selenium: How to Find a Page's Input Box, Enter Data, and Crawl by Search Keyword

Series

Chapter 1: Implementing a crawler with selenium — an image-scraping example
Chapter 2: Making the crawler incremental — an incremental selenium crawler
Chapter 3: A selenium keyword-search crawler (this article)


Preface

After finishing the scrape of the 4K beauty wallpapers last time, I noticed the anime section also has plenty of nice wallpapers. Crawling the entire anime section directly, though, would pull in a lot of unwanted images, so this time I crawl by search keyword instead.


1. Source Code

A note up front: newer Chrome releases stopped working with my selenium setup, so I have switched the browser to Firefox.

import requests
from selenium import webdriver
import os
import pymysql

def hide():
    options = webdriver.FirefoxOptions()
    options.add_argument('-headless')
    driver = webdriver.Firefox(options=options)
    return driver

def Gethtml(url):
    driver = hide()
    driver.get(url)
    # collect the image links; the last <a> is the "next page" link
    s = driver.find_elements_by_css_selector("div[class='slist'] li a")
    if str(s[-1].get_attribute("href")).split("/")[-2] == "4kmeinv":
        geturl(s[:-1])  # more pages remain: crawl everything except the next-page link
    else:
        geturl(s)       # last page: crawl every link
    print(s[-1].get_attribute("href"))
    Gethtml(s[-1].get_attribute("href"))  # recurse into the next page

def huoqvpicture(url):
    driver = hide()  # hide() already returns a ready headless Firefox driver
    driver.get(url)
    s = driver.find_element_by_css_selector("div[class='photo-pic'] a img")
    print(s.get_attribute("title"))
    insert(url, s.get_attribute("src"), s.get_attribute("title"))
    GetPicture(str(s.get_attribute("src")), str(s.get_attribute("title")))

def GetPicture(url,name):
    root = "../dist/"
    path = root + name.replace(" ","") + ".jpg"
    try:
        if not os.path.exists(root):
            os.mkdir(root)
        if not os.path.exists(path):
            r = requests.get(url)
            with open(path, 'wb') as f:  # the with block closes the file for us
                f.write(r.content)
            print("file saved")
        else:
            print("file already exists")
    except Exception:
        print("download failed")

def geturl(s):
    for i in s:
        print(i.get_attribute("href"))
        if not qvchong(i.get_attribute("href")):
            huoqvpicture(str(i.get_attribute("href")))

def insert(html,jpg,name):
    con = pymysql.connect(host="121.196.244.215", user="root", password="123456" ,port=3306 ,db="tupian",charset="utf8")
    cur = con.cursor()
    html = str(html)
    jpg = str(jpg)
    name = str(name)

    # the statement is built by string concatenation; the quoting problem is fixed in version 2
    sql = "insert into suoyin(html,jpg,name) values('"+html+"','"+jpg+"','"+name+"');"
    print(sql)
    cur.execute(sql)
    con.commit()

def qvchong(i):
    con = pymysql.connect(host="121.196.244.215", user="root", password="123456", port=3306, db="tupian",
                          charset="utf8")
    cur=con.cursor()
    sql="select html from suoyin"
    cur.execute(sql)
    results = cur.fetchall()
    i = (str(i),)  # wrap the URL as a 1-tuple to match the rows from fetchall()
    if i in results:
        print("record already exists")
        return True
    else:
        return False

def main():
    url="https://pic.netbian.com/4kmeinv/index.html"
    Gethtml(url)

main()

2. Modified Code

import requests
from selenium import webdriver
import os
import pymysql
import time

def hide():
    options = webdriver.FirefoxOptions()
    options.add_argument('-headless')
    driver = webdriver.Firefox(options=options)
    return driver

def huoqvpicture(url):
    driver = hide()
    driver.get(url)
    s=driver.find_element_by_css_selector("div[class='photo-pic'] a img")
    print(s.get_attribute("title"))
    insert(url,s.get_attribute("src"),s.get_attribute("title"))
    GetPicture(str(s.get_attribute("src")),str(s.get_attribute("title")))
    driver.close()

def GetPicture(url,name):
    root = "../dist/"
    path = root + name.replace(" ","") + ".jpg"
    try:
        if not os.path.exists(root):
            os.mkdir(root)
        if not os.path.exists(path):
            r = requests.get(url)
            with open(path, 'wb') as f:  # the with block closes the file for us
                f.write(r.content)
            print("file saved")
        else:
            print("file already exists")
    except Exception:
        print("download failed")

def geturl(s):
    for i in s:
        print(i.get_attribute("href"))
        if not qvchong(i.get_attribute("href")):
            huoqvpicture(str(i.get_attribute("href")))

def insert(html,jpg,name):
    con = pymysql.connect(host="121.196.244.215", user="root", password="123456" ,port=3306 ,db="tupian",charset="utf8")
    cur=con.cursor()
    html=str(html)
    jpg=str(jpg)
    name = qvdian(str(name))  # strip single quotes so the SQL string stays valid
    sql = "insert into suoyin(html,jpg,name) values('"+html+"','"+jpg+"','"+name+"');"
    print("inserted one row")
    cur.execute(sql)
    con.commit()

def qvchong(i):
    con = pymysql.connect(host="121.196.244.215", user="root", password="123456", port=3306, db="tupian",
                          charset="utf8")
    cur=con.cursor()
    sql="select html from suoyin"
    cur.execute(sql)
    results = cur.fetchall()
    i = (str(i),)  # wrap the URL as a 1-tuple to match the rows from fetchall()
    if i in results:
        print("record already exists")
        return True
    else:
        return False

def geisuo(driver,a):
    s = driver.find_elements_by_css_selector("div[class='slist'] li a")
    print(a)
    a = a + 1
    if str(s[-1].get_attribute("href")).split("/")[-2] == "result":
        geturl(s[:-1])
    else:
        geturl(s)
    bt=driver.find_element_by_class_name("nextpage")
    bt.click()
    geisuo(driver,a)

def click(url):
    driver = hide()
    driver.implicitly_wait(3)
    driver.get(url)
    keyboard = driver.find_element_by_name("keyboard")
    time.sleep(1)
    keyboard.send_keys("美女")
    bt = driver.find_element_by_name("submit")
    time.sleep(1)
    bt.click()
    geisuo(driver,1)

def qvdian(s):
    s=str(s)
    ls=s.split("'")
    s="".join(ls)
    return s


def main():
    url="https://pic.netbian.com/e/search/result/index.php?page=1&searchid=16"
    click(url)

main()

3. Code Analysis

def hide():
    options = webdriver.FirefoxOptions()
    options.add_argument('-headless')
    driver = webdriver.Firefox(options=options)
    return driver

After switching browsers I simplified the headless setup: hide() now returns a ready-to-use driver directly, which keeps every call site short.
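A side note: the listings use the Selenium 3-era find_element_by_* / find_elements_by_* helpers, which were removed in Selenium 4. If you run a current Selenium, the sketch below shows the equivalent calls (a minimal example, assuming Selenium 4.x with geckodriver on the PATH):

from selenium import webdriver
from selenium.webdriver.common.by import By

options = webdriver.FirefoxOptions()
options.add_argument('-headless')
driver = webdriver.Firefox(options=options)
driver.get("https://pic.netbian.com/4kmeinv/index.html")
# Selenium 4 style: one generic find method plus a By locator
links = driver.find_elements(By.CSS_SELECTOR, "div[class='slist'] li a")
for a in links:
    print(a.get_attribute("href"))
driver.quit()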

def click(url):
    driver = hide() # create the headless browser
    driver.implicitly_wait(3) # set an implicit wait for element lookups
    driver.get(url) # load the page
    keyboard = driver.find_element_by_name("keyboard")
    # locate the search box
    time.sleep(1) # pause one second (to look more human)
    keyboard.send_keys("美女")
    # type the search keyword into the box
    bt = driver.find_element_by_name("submit")
    # locate the search button
    time.sleep(1) # pause one second (to look more human)
    bt.click() # click the search button
    geisuo(driver,1) # hand the driver and the page number on

This is the heart of the new code: it is what actually performs the keyword search.
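The fixed time.sleep(1) pauses do the job, but an explicit wait is more robust when the page loads slowly. Here is a minimal sketch of the same function using Selenium's WebDriverWait (standard Selenium support classes; the locators are the same ones used above):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def click(url):
    driver = hide()
    driver.get(url)
    wait = WebDriverWait(driver, 10)  # poll for up to 10 seconds
    # block until the search box is present and clickable, then type into it
    keyboard = wait.until(EC.element_to_be_clickable((By.NAME, "keyboard")))
    keyboard.send_keys("美女")
    # same for the search button
    bt = wait.until(EC.element_to_be_clickable((By.NAME, "submit")))
    bt.click()
    geisuo(driver, 1)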

def geisuo(driver,a):
    s = driver.find_elements_by_css_selector("div[class='slist'] li a") # collect the links (the last one is "next page")
    print(a) # print the current page number so you can follow progress
    a = a + 1 # advance the page counter
    if str(s[-1].get_attribute("href")).split("/")[-2] == "result":
    # check whether we have reached the last page
        geturl(s[:-1])
        # not the last page yet: crawl everything except the final link
    else:
        geturl(s)
        # last page: crawl every link
    bt=driver.find_element_by_class_name("nextpage")
    # locate the "next page" button
    bt.click()
    # click through to the next page
    geisuo(driver,a)
    # call itself to process the new page

Previously I turned pages by fetching a new URL each time; this version pages by clicking the button instead.
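One caveat: geisuo calls itself once per page, so a long result set can run into Python's recursion limit, and the unconditional click raises NoSuchElementException once no next-page button exists. A minimal iterative rewrite under the same assumptions (same page structure, same helper functions) avoids both:

from selenium.common.exceptions import NoSuchElementException

def geisuo(driver, a):
    while True:
        s = driver.find_elements_by_css_selector("div[class='slist'] li a")
        print(a)  # current page number
        a = a + 1
        if str(s[-1].get_attribute("href")).split("/")[-2] == "result":
            geturl(s[:-1])  # skip the trailing next-page link
        else:
            geturl(s)       # last page: crawl every link
        try:
            driver.find_element_by_class_name("nextpage").click()
        except NoSuchElementException:
            break  # no next-page button left: we are done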

def huoqvpicture(url):
    driver = hide()
    driver.get(url)
    s=driver.find_element_by_css_selector("div[class='photo-pic'] a img")
    print(s.get_attribute("title"))
    insert(url,s.get_attribute("src"),s.get_attribute("title"))
    GetPicture(str(s.get_attribute("src")),str(s.get_attribute("title")))
    driver.close() # close the browser window to free memory

My advice is to turn headless mode on only after all the code is finished. Earlier, my script opened a new browser for every single image and never closed any of them; with headless mode on I couldn't even see it happening, and it ate a great deal of memory.
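To guarantee the browser really goes away even when a lookup throws, wrapping the work in try/finally and calling quit() (which ends the whole driver session, while close() only closes the window) is safer. A minimal sketch of huoqvpicture with that guard:

def huoqvpicture(url):
    driver = hide()
    try:
        driver.get(url)
        s = driver.find_element_by_css_selector("div[class='photo-pic'] a img")
        print(s.get_attribute("title"))
        insert(url, s.get_attribute("src"), s.get_attribute("title"))
        GetPicture(str(s.get_attribute("src")), str(s.get_attribute("title")))
    finally:
        driver.quit()  # always runs, even if the element lookup above fails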

def qvdian(s):
    s=str(s) # make sure the value is a string
    ls=s.split("'") # split the string on every single quote
    s="".join(ls) # join the pieces back together, dropping the quotes
    return s

When inserting into the database, a single quote ( ' ) inside the title breaks the SQL statement and the insert fails, so this small function strips the quote out; one bug fixed.
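Stripping quotes works, but the root cause is building SQL by string concatenation. Parameterized queries (standard pymysql) handle quoting for you and make qvdian unnecessary, and the dedup check can be pushed into the database instead of fetching every row. A minimal sketch, reusing the article's table and connection settings:

import pymysql

def insert(html, jpg, name):
    con = pymysql.connect(host="121.196.244.215", user="root", password="123456",
                          port=3306, db="tupian", charset="utf8")
    try:
        with con.cursor() as cur:
            # %s placeholders: pymysql escapes quotes, so qvdian is no longer needed
            cur.execute("insert into suoyin(html,jpg,name) values(%s,%s,%s)",
                        (str(html), str(jpg), str(name)))
        con.commit()
    finally:
        con.close()

def qvchong(i):
    con = pymysql.connect(host="121.196.244.215", user="root", password="123456",
                          port=3306, db="tupian", charset="utf8")
    try:
        with con.cursor() as cur:
            # let MySQL do the lookup instead of scanning the whole table in Python
            cur.execute("select 1 from suoyin where html=%s limit 1", (str(i),))
            return cur.fetchone() is not None
    finally:
        con.close()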


Summary

That covers most of what selenium has to offer for this series. I am moving on to the scrapy crawler framework; once that is done I will have worked through essentially all of Python's crawling toolkit. This concludes the selenium articles.


