当前位置: 首页>编程语言>正文

python带你采集当当网商品及评论数据并实现词云图

前言 😋

嗨喽，大家好呀~这里是爱看美女的茜茜呐

本次采集网介绍：图书频道-全球最大中文网上书店

专业提供小说传记,青春文学,成功励志,投资理财等各品类图书

畅销榜最新报价、促销、评论信息，引领最新网上购书体验！

python带你采集当当网商品及评论数据并实现词云图,第1张

鐜浣跨敤 馃巿:

  • Python 3.8

  • Pycharm

模块使用 🎠:

  • requests >>> pip install requests

  • parsel >>> pip install parsel

  • csv

鐖櫕鍩烘湰鎬濊矾娴佺▼ 馃帄:

涓€. 鏁版嵁鏉ユ簮鍒嗘瀽

  1. 纭畾鑷繁閲囬泦鏁版嵁鍐呭
  2. 鎶撳寘鍒嗘瀽,鑷繁鎯宠鏁版嵁鏉ヨ嚜鍝噷 ---> 璇锋眰閭d釜url鍦板潃寰楀埌鎯宠鐨勬暟鎹?/li>
  • 寮€鍙戣€呭伐鍏锋姄鍖呭垎鏋?F12 鎴栬€?榧犳爣鍙抽敭鐐瑰嚮妫€鏌?閫夋嫨 network(缃戠粶), 鍒锋柊缃戦〉
  • 閫氳繃鍏抽敭瀛?鎴戜滑鎯宠鏁版嵁姣斿: 涔﹀悕) 鍘绘悳绱㈡暟鎹寘鏄偅涓?---> 纭畾璇锋眰鏄偅涓綉鍧€寰楀埌鏁版嵁鍐呭

璇锋眰杩欎釜缃戠珯 灏卞彲浠ュ緱鍒版垜浠兂瑕佹暟鎹唴瀹?/p>

二. 代码实现步骤:

  1. 发送请求, 模拟浏览器对于url发送请求

  2. 获取数据, 获取服务器返回响应数据 ---> 开发者工具里面response

  3. 解析数据, 提取我们想要数据内容, 书籍基本信息

  4. 保存数据, 保存表格里面

python带你采集当当网商品及评论数据并实现词云图,第2张

数据采集 🎢

# 导入数据请求模块  ---> 第三方模块 需要 在cmd 里面 pip install requests
import requests
# 导入数据解析模块 ---> 第三方模块 需要 在cmd 里面 pip install parsel
import parsel
# 导入csv模块 ---> 内置模块 不需要安装
import csv

# Output file for the scraped ranking data. It is opened in append mode, so
# repeated runs accumulate rows in the same file.
BOOK_CSV_PATH = '书籍data25页.csv'

# CSV column headers in output order (they are also the keys of every row):
# title, review count, recommendation, author, date, publisher, sale price,
# list price, discount, e-book price, detail-page URL.
BOOK_FIELDNAMES = [
    '标题',
    '评论',
    '推荐',
    '作者',
    '日期',
    '出版社',
    '售价',
    '原价',
    '折扣',
    '电子书',
    '详情页',
]

# Browser-like request headers. Sending a User-Agent is a simple anti-bot
# countermeasure so the server does not immediately see a script client.
# The value was copied from the browser's developer tools.
BOOK_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36'
}


def strip_token(text, token):
    """Return *text* with every occurrence of *token* removed.

    ``None`` is passed through unchanged, because a CSS query that matches
    nothing yields ``None`` and calling ``.replace`` on it would raise
    ``AttributeError`` and abort the whole crawl.
    """
    if text is None:
        return None
    return text.replace(token, '')


def parse_book_item(li):
    """Build one CSV row dict from a single ranking-list ``<li>`` selector.

    Each field is the first match of a CSS query evaluated relative to *li*
    (``::attr(name)`` selects an attribute value, ``::text`` the text node).
    Fields whose query matches nothing are ``None``.
    """
    def first(query):
        # .get() returns the first match, or None when nothing matches.
        return li.css(query).get()

    return {
        '标题': first('.name a::attr(title)'),
        # Drop the textual suffix so only the count remains.
        '评论': strip_token(first('.star a::text'), '条评论'),
        '推荐': strip_token(first('.tuijian::text'), '推荐'),
        '作者': first('.publisher_info a::attr(title)'),
        '日期': first('.publisher_info span::text'),
        '出版社': first('div:nth-child(6) a::text'),
        '售价': first('.price .price_n::text'),
        '原价': first('.price .price_r::text'),
        '折扣': strip_token(first('.price .price_s::text'), '折'),
        '电子书': first('.price .price_e .price_n::text'),
        '详情页': first('.name a::attr(href)'),
    }


def scrape_bestsellers(first_page=1, last_page=25):
    """Scrape the best-seller ranking pages and append the rows to the CSV.

    Steps per page: send the request, parse the returned HTML, extract the
    fields of every list entry, write one CSV row per entry and echo it to
    stdout.

    Args:
        first_page: first ranking page to fetch (inclusive).
        last_page: last ranking page to fetch (inclusive).
    """
    # The context manager guarantees the file is flushed and closed even if
    # a request or a parse step raises part-way through the crawl.
    with open(BOOK_CSV_PATH, mode='a', encoding='utf-8', newline='') as f:
        csv_writer = csv.DictWriter(f, fieldnames=BOOK_FIELDNAMES)
        # In append mode the stream starts at the end of the file, so a
        # position of 0 means the file is empty. Writing the header only
        # then avoids a duplicated header row on every re-run.
        if f.tell() == 0:
            csv_writer.writeheader()

        for page in range(first_page, last_page + 1):
            url = f'http://bang.dangdang.com/books/bestsellers/01.00.00.00.00.00-recent7-0-0-1-{page}'
            # A timeout keeps one stalled connection from hanging the crawl.
            response = requests.get(url=url, headers=BOOK_HEADERS, timeout=10)
            # Prints e.g. <Response [200]>; 200 means the request succeeded.
            print(response)

            # Turn the HTML text into a selector, then take every <li>
            # under the ranking list and extract the fields from each one.
            selector = parsel.Selector(response.text)
            for li in selector.css('.bang_list_mode li'):
                row = parse_book_item(li)
                csv_writer.writerow(row)
                print(*row.values(), sep=' | ')


if __name__ == '__main__':
    scrape_bestsellers()

璇勮 馃帳

# 导入数据请求模块
import time
import requests
import re
# Endpoint that serves the comment list of a product page.
COMMENT_URL = 'http://product.dangdang.com/index.php'

# Text file that receives one comment per line (append mode).
COMMENT_FILE = '评论.txt'

# Captures the comment text inside each comment link of the returned HTML
# fragment. Compiled once here instead of on every page.
COMMENT_PATTERN = re.compile(r"<span><a href='.*?' target='_blank'>(.*?)</a></span>")

# Request headers copied from the browser's developer tools.
# NOTE(review): the Cookie is a captured browser session value and has
# presumably expired; refresh it if the endpoint stops returning data.
COMMENT_HEADERS = {
    'Cookie': '__permanent_id=20220526142043051185927786403737954; dest_area=country_id%3D9000%26province_id%3D111%26city_id%20%3D0%26district_id%3D0%26town_id%3D0; ddscreen=2; secret_key=f4022441400c500aa79d59edd8918a6e; __visit_id=20220723213635653213297242210260506; __out_refer=; pos_6_start=1658583812022; pos_6_end=1658583812593; __trace_id=20220723214559176959858324136999851; __rpm=p_27898031.comment_body..1658583937494%7Cp_27898031.comment_body..1658583997600',
    'Host': 'product.dangdang.com',
    'Referer': 'http://product.dangdang.com/27898031.html',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.0.0 Safari/537.36',
}


def comment_params(page):
    """Return the query parameters for comment page number *page*.

    Every value except ``pageIndex`` is fixed and targets product 27898031.
    """
    return {
        'r': 'comment/list',
        'productId': '27898031',
        'categoryPath': '01.43.77.07.00.00',
        'mainProductId': '27898031',
        'mediumId': '0',
        'pageIndex': page,
        'sortType': '1',
        'filterType': '1',
        'isSystem': '1',
        'tagId': '0',
        'tagFilterCount': '0',
        'template': 'publish',
        'long_or_short': 'short',
    }


def extract_comments(html_data):
    """Return the list of comment texts found in an HTML fragment."""
    return COMMENT_PATTERN.findall(html_data)


def scrape_comments(first_page=1, last_page=10, delay=1.5):
    """Fetch comment pages and append every comment to ``COMMENT_FILE``.

    Args:
        first_page: first comment page to fetch (inclusive).
        last_page: last comment page to fetch (inclusive).
        delay: seconds to sleep before each request, to throttle the crawl.
    """
    # Open the output once for the whole crawl rather than once per comment;
    # the context manager closes it even if a request fails.
    with open(COMMENT_FILE, mode='a', encoding='utf-8') as f:
        for page in range(first_page, last_page + 1):
            time.sleep(delay)
            # A timeout keeps one stalled connection from hanging the crawl.
            response = requests.get(
                url=COMMENT_URL,
                params=comment_params(page),
                headers=COMMENT_HEADERS,
                timeout=10,
            )
            # The JSON payload carries the rendered comment list as an HTML
            # string under data -> list -> html.
            html_data = response.json()['data']['list']['html']
            for content in extract_comments(html_data):
                f.write(content)
                f.write('\n')
                print(content)


if __name__ == '__main__':
    scrape_comments()

词云图 🃏

import jieba
import wordcloud
import imageio
# Words excluded from the cloud.
# NOTE(review): this literal was damaged by an encoding error in the source.
# The first three entries are unambiguous; the fourth is assumed to be '也'
# (the damaged bytes could also decode to '书') - confirm against the
# original script.
WORDCLOUD_STOPWORDS = {'的', '了', '很', '也'}


def build_wordcloud(text_path='评论.txt', mask_path='python.png', output_path='1.png'):
    """Render a word-cloud image from the collected comment text.

    Args:
        text_path: UTF-8 text file with the comments to visualise.
        mask_path: image whose shape is used as the word-cloud mask.
        output_path: file the rendered image is written to.
    """
    # Load the mask image.
    py = imageio.imread(mask_path)

    # Read the whole comment file; the context manager closes the handle,
    # which the previous bare open() never did.
    with open(text_path, encoding='utf-8') as f:
        txt = f.read()

    # Segment the text into a list of words with jieba.
    txt_list = jieba.lcut(txt)
    print(txt_list)
    # WordCloud.generate() takes one string, so join the words with spaces.
    string = ' '.join(txt_list)

    wc = wordcloud.WordCloud(
        height=300,  # canvas height
        width=500,  # canvas width
        background_color='white',  # background colour
        font_path='msyh.ttc',  # font file; must be able to render Chinese glyphs
        # Rendering scale factor. (The original comment labelled this
        # "contour", which is a separate WordCloud option.)
        scale=15,
        stopwords=WORDCLOUD_STOPWORDS,  # words to leave out
        mask=py,  # custom shape for the cloud
    )
    wc.generate(string)  # feed in the text the cloud is built from
    wc.to_file(output_path)  # write the rendered image


if __name__ == '__main__':
    build_wordcloud()

灏捐 馃挐

鎰熻阿浣犺鐪嬫垜鐨勬枃绔犲憪~鏈鑸彮鍒拌繖閲屽氨缁撴潫鍟? 馃洭

甯屾湜鏈瘒鏂囩珷鏈夊浣犲甫鏉ュ府鍔?馃帀锛屾湁瀛︿範鍒颁竴鐐圭煡璇唦

韬茶捣鏉ョ殑鏄熸槦馃崶涔熷湪鍔姏鍙戝厜锛屼綘涔熻鍔姏鍔犳补锛堣鎴戜滑涓€璧峰姫鍔涘彮锛夈€?/p>

涓嶇煡閬撹瘎璁哄暐鐨勶紝鍗充娇鎵d釜6666涔熸槸瀵瑰崥涓荤殑榧撹垶鍚?馃挒 鎰熻阿 馃拹


https://www.xamrdz.com/lan/5xu1997265.html

相关文章: