Deduplication strategies for Python crawlers
1. Save visited URLs in a database
2. Save visited URLs in a set
Pros: each URL can be checked at O(1) cost
Cons: high memory usage. With 100 million pages: 100,000,000 URLs * 50 characters * 2 bytes per character / 1024 / 1024 / 1024 ≈ 9 GB
3. Hash each URL (e.g. with md5) and keep the digests in a set
Pros: cuts memory usage several-fold; this is the approach Scrapy takes
4. Use a bitmap or a Bloom filter: map each visited URL to bit positions via hash functions (a minimal bitmap sketch follows this list)
Bitmap pros: 100 million URLs take only about 12 MB
Bitmap cons: deduplication is not exact, because different URLs can hash to the same bit.
Bloom filter pros: improves on the bitmap by using multiple hash functions to reduce collisions
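The notes give no code for the plain bitmap variant, so here is a minimal sketch. The 100,000,000-bit size matches the 12 MB estimate above; the single md5-derived bit position is an illustrative choice, and it is exactly why collisions occur: two different URLs can land on the same bit.

import hashlib

BIT_COUNT = 100000000  # 10^8 bits / 8 / 1024 / 1024 ≈ 12 MB, one bit per URL

bitmap = bytearray(BIT_COUNT // 8)

def bit_position(url):
    # map the URL to a single bit index via a hash (md5 for simplicity)
    digest = hashlib.md5(url.encode('utf8')).digest()
    return int.from_bytes(digest[:8], 'big') % BIT_COUNT

def seen(url):
    pos = bit_position(url)
    return bitmap[pos // 8] >> (pos % 8) & 1

def mark(url):
    pos = bit_position(url)
    bitmap[pos // 8] |= 1 << (pos % 8)

url = 'http://www.kongjie.com/home.php?mod=space&do=album&view=all&page=1'
if not seen(url):
    mark(url)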
Example of the first approach (visited URLs stored in MongoDB)
import os
import re

import requests
from bs4 import BeautifulSoup
import pymongo


class KongjieSpider:
    def __init__(self):
        server = 'localhost'
        port = '27017'
        dbname = 'admin'
        user = 'admin'
        pwd = '123'
        uri = 'mongodb://' + user + ':' + pwd + '@' + server + ':' + port + '/' + dbname
        client = pymongo.MongoClient(uri)  # connect to MongoDB
        db = client['dbkongjie1805']  # select a database
        self.kongjie_collection = db['kongjie']  # select a collection in that database

    def getUA(self):
        user_agent = 'Mozilla/5.0 (compatible; WOW64; MSIE 10.0; Windows NT 6.2)'
        headers = {'User-Agent': user_agent}
        return headers

    def parse_album_url(self, url):
        """
        Parse out the album URLs, then enter each album and crawl its images.
        """
        headers = self.getUA()
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        people_list = soup.select('div.ptw > ul > li')
        for people in people_list:
            self.save_images_in_album(people.div.a['href'])
        # crawl the next page of the album list
        next_page = soup.select_one('a.nxt')
        if next_page:
            self.parse_album_url(next_page['href'])
        else:
            print('Download finished!')

    def save_images_in_album(self, album_url):
        """
        Enter a kongjie.com user's album and save its images one by one.
        """
        headers = self.getUA()
        response = requests.get(album_url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        ls = soup.select('ul.ptw.ml.mlp.cl > li')
        if len(ls) > 0:
            print('len ', len(ls))
        for item in ls:
            # extract the URL of the photo page
            url = item.select_one('a')['href']
            # deduplication: skip the photo page if it is already in the database,
            # otherwise run the else branch below
            if self.kongjie_collection.find_one({'img_url': url}):
                print('This page has already been crawled')
            else:
                pat = re.compile(r'uid=(\d+)&.*?picid=(\d+)')
                matchObj = pat.search(url)
                uid = matchObj.group(1)
                picid = matchObj.group(2)
                print('uid:', uid)
                print('picid:', picid)
                # open the photo page and extract the src attribute of the image
                response = requests.get(url, headers=headers)
                soup1 = BeautifulSoup(response.text, 'lxml')
                img_url = soup1.select_one('div#photo_pic > a > img')['src']
                # download the image
                response = requests.get(img_url, headers=headers)
                os.makedirs('./images', exist_ok=True)  # make sure the target directory exists
                imgName = './images/' + uid + picid + '.jpg'
                with open(imgName, 'wb') as file:
                    file.write(response.content)
                # record the URL so future runs skip this page
                self.kongjie_collection.insert_one({'img_url': url})
        # crawl the next page of the album
        ls = soup.select('a.nxt')
        print('next_page: ', len(ls))
        if len(ls) > 0:
            next_page_url = ls[0]['href']
            print('next_page_url:', next_page_url)
            self.save_images_in_album(next_page_url)


if __name__ == '__main__':
    start_url = 'http://www.kongjie.com/home.php?mod=space&do=album&view=all&page=1'
    spider = KongjieSpider()
    spider.parse_album_url(start_url)
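With the database approach, every find_one() above scans the collection unless img_url is indexed. A one-time setup step (not part of the original code, but standard pymongo) creates a unique index, which both speeds up the lookup and lets MongoDB reject duplicates by itself:

import pymongo

client = pymongo.MongoClient('mongodb://admin:123@localhost:27017/admin')
collection = client['dbkongjie1805']['kongjie']

# one-time setup: a unique index makes find_one({'img_url': ...}) fast
# and turns a duplicate insert into an error instead of a silent extra row
collection.create_index('img_url', unique=True)

try:
    collection.insert_one({'img_url': 'http://example.com/photo?picid=1'})  # hypothetical URL
except pymongo.errors.DuplicateKeyError:
    print('This page has already been crawled')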
Example of the second approach (visited URLs stored in an in-memory set):
import os
import re

import requests
from bs4 import BeautifulSoup
import pymongo


class KongjieSpider:
    def __init__(self):
        server = 'localhost'
        port = '27017'
        dbname = 'admin'
        user = 'admin'
        pwd = '123'
        uri = 'mongodb://' + user + ':' + pwd + '@' + server + ':' + port + '/' + dbname
        client = pymongo.MongoClient(uri)  # connect to MongoDB (a local instance here)
        db = client['dbkongjie']  # select a database
        self.kongjie_collection = db['kongjie']  # kept from the first example; unused in this version
        self.img_urls = set()  # in-memory set of photo-page URLs already crawled

    def getUA(self):
        user_agent = 'Mozilla/5.0 (compatible; WOW64; MSIE 10.0; Windows NT 6.2)'
        headers = {'User-Agent': user_agent}
        return headers

    def parse_album_url(self, url):
        """
        Parse out the album URLs, then enter each album and crawl its images.
        """
        headers = self.getUA()
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        people_list = soup.select('div.ptw > ul > li')
        for people in people_list:
            self.save_images_in_album(people.div.a['href'])
        # crawl the next page of the album list
        next_page = soup.select_one('a.nxt')
        if next_page:
            self.parse_album_url(next_page['href'])
        else:
            print('Download finished!')

    def save_images_in_album(self, album_url):
        """
        Enter a kongjie.com user's album and save its images one by one.
        """
        headers = self.getUA()
        response = requests.get(album_url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        ls = soup.select('ul.ptw.ml.mlp.cl > li')
        if len(ls) > 0:
            print('len ', len(ls))
        for item in ls:
            # extract the URL of the photo page
            url = item.select_one('a')['href']
            # deduplication: skip the photo page if its URL is already in the set
            if url in self.img_urls:
                print('This page has already been crawled')
            else:
                pat = re.compile(r'uid=(\d+)&.*?picid=(\d+)')
                matchObj = pat.search(url)
                uid = matchObj.group(1)
                picid = matchObj.group(2)
                print('uid:', uid)
                print('picid:', picid)
                # open the photo page and extract the src attribute of the image
                response = requests.get(url, headers=headers)
                soup1 = BeautifulSoup(response.text, 'lxml')
                img_url = soup1.select_one('div#photo_pic > a > img')['src']
                # download the image
                response = requests.get(img_url, headers=headers)
                os.makedirs('./images', exist_ok=True)  # make sure the target directory exists
                imgName = './images/' + uid + picid + '.jpg'
                with open(imgName, 'wb') as file:
                    file.write(response.content)
                self.img_urls.add(url)  # record the URL in the set
                print('count:', len(self.img_urls))
        # crawl the next page of the album
        ls = soup.select('a.nxt')
        print('next_page: ', len(ls))
        if len(ls) > 0:
            next_page_url = ls[0]['href']
            print('next_page_url:', next_page_url)
            self.save_images_in_album(next_page_url)


if __name__ == '__main__':
    start_url = 'http://www.kongjie.com/home.php?mod=space&do=album&view=all&page=1'
    spider = KongjieSpider()
    spider.parse_album_url(start_url)
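The 9 GB estimate for approach 2 can be sanity-checked empirically. A rough sketch follows; the URL pattern is synthetic, and CPython's per-object overhead means a real set costs noticeably more than the bare 2-bytes-per-character arithmetic suggests:

import sys

# build one million synthetic ~50-character URLs and measure the set's
# hash table plus the strings it references
urls = {'http://www.kongjie.com/photo.php?picid=%09d' % i for i in range(1000000)}
total = sys.getsizeof(urls) + sum(sys.getsizeof(u) for u in urls)
print('one million URLs: about %.0f MB' % (total / 1024 / 1024))
# scaled to 100 million URLs this lands in the multi-GB range estimated above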
Example of the third approach (md5 digests of the URLs stored in a set)
import os
import re
from hashlib import md5

import requests
from bs4 import BeautifulSoup
import pymongo


class KongjieSpider:
    def __init__(self):
        server = 'localhost'
        port = '27017'
        dbname = 'admin'
        user = 'admin'
        pwd = '123'
        uri = 'mongodb://' + user + ':' + pwd + '@' + server + ':' + port + '/' + dbname
        client = pymongo.MongoClient(uri)  # connect to MongoDB (a local instance here)
        db = client['dbkongjie']  # select a database
        self.kongjie_collection = db['kongjie']  # kept from the first example; unused in this version
        self.img_urls = set()  # in-memory set of md5 digests of crawled photo-page URLs

    def getUA(self):
        user_agent = 'Mozilla/5.0 (compatible; WOW64; MSIE 10.0; Windows NT 6.2)'
        headers = {'User-Agent': user_agent}
        return headers

    def parse_album_url(self, url):
        """
        Parse out the album URLs, then enter each album and crawl its images.
        """
        headers = self.getUA()
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        people_list = soup.select('div.ptw > ul > li')
        for people in people_list:
            self.save_images_in_album(people.div.a['href'])
        # crawl the next page of the album list
        next_page = soup.select_one('a.nxt')
        if next_page:
            self.parse_album_url(next_page['href'])
        else:
            print('Download finished!')

    def save_images_in_album(self, album_url):
        """
        Enter a kongjie.com user's album and save its images one by one.
        """
        headers = self.getUA()
        response = requests.get(album_url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        ls = soup.select('ul.ptw.ml.mlp.cl > li')
        if len(ls) > 0:
            print('len ', len(ls))
        for item in ls:
            # extract the URL of the photo page
            url = item.select_one('a')['href']
            # deduplication: hash the URL with md5 and check the digest against the set
            hash_md5 = md5(url.encode('utf8'))
            hash_str = hash_md5.hexdigest()
            if hash_str in self.img_urls:
                print('This page has already been crawled')
            else:
                pat = re.compile(r'uid=(\d+)&.*?picid=(\d+)')
                matchObj = pat.search(url)
                uid = matchObj.group(1)
                picid = matchObj.group(2)
                print('uid:', uid)
                print('picid:', picid)
                # open the photo page and extract the src attribute of the image
                response = requests.get(url, headers=headers)
                soup1 = BeautifulSoup(response.text, 'lxml')
                img_url = soup1.select_one('div#photo_pic > a > img')['src']
                # download the image
                response = requests.get(img_url, headers=headers)
                os.makedirs('./images', exist_ok=True)  # make sure the target directory exists
                imgName = './images/' + uid + picid + '.jpg'
                with open(imgName, 'wb') as file:
                    file.write(response.content)
                self.img_urls.add(hash_str)  # record the digest instead of the full URL
                print('count:', len(self.img_urls), hash_str)
        # crawl the next page of the album
        ls = soup.select('a.nxt')
        print('next_page: ', len(ls))
        if len(ls) > 0:
            next_page_url = ls[0]['href']
            print('next_page_url:', next_page_url)
            self.save_images_in_album(next_page_url)


if __name__ == '__main__':
    start_url = 'http://www.kongjie.com/home.php?mod=space&do=album&view=all&page=1'
    spider = KongjieSpider()
    spider.parse_album_url(start_url)
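The saving in approach 3 comes from the digest having a fixed size no matter how long the URL is. The hexdigest() string used above is 32 characters; a small variation, storing the raw digest() bytes instead, would halve the payload again:

from hashlib import md5

url = 'http://www.kongjie.com/home.php?mod=space&do=album&view=all&page=1'
hex_str = md5(url.encode('utf8')).hexdigest()  # 32-character hex string, as stored above
raw = md5(url.encode('utf8')).digest()         # the same hash as 16 raw bytes
print(len(url), len(hex_str), len(raw))        # the URL length varies; the digests are fixed at 32 and 16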
Example of the fourth approach (Bloom filter)
import os
import re

import requests
from bs4 import BeautifulSoup
import pymongo
from pybloom_live import ScalableBloomFilter


class KongjieSpider:
    def __init__(self):
        server = 'localhost'
        port = '27017'
        dbname = 'admin'
        user = 'admin'
        pwd = '123'
        uri = 'mongodb://' + user + ':' + pwd + '@' + server + ':' + port + '/' + dbname
        client = pymongo.MongoClient(uri)  # connect to MongoDB (a local instance here)
        db = client['dbkongjie']  # select a database
        self.kongjie_collection = db['kongjie']  # kept from the first example; unused in this version
        # scalable Bloom filter: starts small and grows as URLs are added,
        # keeping the false-positive rate bounded by error_rate
        self.sbf = ScalableBloomFilter(initial_capacity=100, error_rate=0.001,
                                       mode=ScalableBloomFilter.LARGE_SET_GROWTH)

    def getUA(self):
        user_agent = 'Mozilla/5.0 (compatible; WOW64; MSIE 10.0; Windows NT 6.2)'
        headers = {'User-Agent': user_agent}
        return headers

    def parse_album_url(self, url):
        """
        Parse out the album URLs, then enter each album and crawl its images.
        """
        headers = self.getUA()
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        people_list = soup.select('div.ptw > ul > li')
        for people in people_list:
            self.save_images_in_album(people.div.a['href'])
        # crawl the next page of the album list
        next_page = soup.select_one('a.nxt')
        if next_page:
            self.parse_album_url(next_page['href'])
        else:
            print('Download finished!')

    def save_images_in_album(self, album_url):
        """
        Enter a kongjie.com user's album and save its images one by one.
        """
        headers = self.getUA()
        response = requests.get(album_url, headers=headers)
        soup = BeautifulSoup(response.text, 'lxml')
        ls = soup.select('ul.ptw.ml.mlp.cl > li')
        if len(ls) > 0:
            print('len ', len(ls))
        for item in ls:
            # extract the URL of the photo page
            url = item.select_one('a')['href']
            # deduplication: membership test against the Bloom filter
            if url in self.sbf:
                print('This page has already been crawled')
            else:
                pat = re.compile(r'uid=(\d+)&.*?picid=(\d+)')
                matchObj = pat.search(url)
                uid = matchObj.group(1)
                picid = matchObj.group(2)
                print('uid:', uid)
                print('picid:', picid)
                # open the photo page and extract the src attribute of the image
                response = requests.get(url, headers=headers)
                soup1 = BeautifulSoup(response.text, 'lxml')
                img_url = soup1.select_one('div#photo_pic > a > img')['src']
                # download the image
                response = requests.get(img_url, headers=headers)
                os.makedirs('./images', exist_ok=True)  # make sure the target directory exists
                imgName = './images/' + uid + picid + '.jpg'
                with open(imgName, 'wb') as file:
                    file.write(response.content)
                self.sbf.add(url)  # record the URL in the Bloom filter
                print('count:', len(self.sbf))
        # crawl the next page of the album
        ls = soup.select('a.nxt')
        print('next_page: ', len(ls))
        if len(ls) > 0:
            next_page_url = ls[0]['href']
            print('next_page_url:', next_page_url)
            self.save_images_in_album(next_page_url)


if __name__ == '__main__':
    start_url = 'http://www.kongjie.com/home.php?mod=space&do=album&view=all&page=1'
    spider = KongjieSpider()
    spider.parse_album_url(start_url)
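One caveat of approach 4 is worth stating: with error_rate=0.001, roughly one unseen URL in a thousand may be reported as already crawled and silently skipped, which is acceptable for image scraping but not where losing a page matters. A quick standalone check of the filter API used above:

from pybloom_live import ScalableBloomFilter

sbf = ScalableBloomFilter(initial_capacity=100, error_rate=0.001,
                          mode=ScalableBloomFilter.LARGE_SET_GROWTH)
url = 'http://www.kongjie.com/home.php?mod=space&do=album&view=all&page=1'
print(url in sbf)  # False: nothing has been added yet
sbf.add(url)
print(url in sbf)  # True: an added element is never reported missing;
                   # only false positives, never false negatives, can occur
print(len(sbf))    # number of elements added so far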