scrapy genspider stats stats.gov.cn
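The genspider command must be run inside an existing Scrapy project; the code below assumes the project is named myspider (that is the package the spider and settings import from). If the project does not exist yet, a typical setup before the command above would be:

scrapy startproject myspider
cd myspider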
settings.py
ROBOTSTXT_OBEY = False  # do not let robots.txt block the crawl
ITEM_PIPELINES = {
    'myspider.pipelines.StatsPipeline': 300,  # 300 is the pipeline priority; lower values run first
}
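Optionally, the crawl can be throttled so the statistics site is not hit too hard. These two standard Scrapy settings are a suggested addition, not part of the original configuration, and the values are only a guess:

# Optional: slow the crawl down a little to stay polite to stats.gov.cn
DOWNLOAD_DELAY = 0.5
CONCURRENT_REQUESTS_PER_DOMAIN = 4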
spiders/stats.py
import scrapy

from myspider.items import StatsItem


class StatsSpider(scrapy.Spider):
    name = 'stats'
    allowed_domains = ['stats.gov.cn']
    # entry page: the 2020 edition of the statistical division codes (province list)
    start_urls = ['http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html']
    def parse(self, response):
        # debug: print the page title to confirm the page was fetched and decoded correctly
        print(response.xpath("/html/head/title/text()").extract_first())
        provinces = response.xpath("//tr[@class='provincetr']")
        # print("provinces:", len(provinces))
        for province in provinces:
            province_name = province.xpath('td/a/text()').extract_first()
            city_href = province.xpath('td/a/@href').extract_first()
            city_url = response.urljoin(city_href)
            yield scrapy.Request(url=city_url, callback=self.parse_city,
                                 meta={'province_name': province_name})
            break  # crawl only the first province while testing; remove for a full crawl
    def parse_city(self, response):
        citys = response.xpath("//tr[@class='citytr']")
        for city in citys:
            # each citytr row carries two linked cells: the city code and the city name
            city_ = city.xpath('td/a/text()').extract()
            ad_href = city.xpath('td/a/@href').extract_first()
            if city_:
                city_code = city_[0]
                city_name = city_[1]
                ad_url = response.urljoin(ad_href)
                # print(city_code, city_name, ad_url)
                yield scrapy.Request(url=ad_url, callback=self.parse_ad,
                                     meta={'city_code': city_code, 'city_name': city_name,
                                           'province_name': response.meta['province_name']})
            break  # crawl only the first city while testing; remove for a full crawl
    def parse_ad(self, response):
        ads = response.xpath("//tr[@class='countytr']")
        for ad in ads:
            ad_ = ad.xpath('td/a/text()').extract()
            town_href = ad.xpath('td/a/@href').extract_first()
            # rows without a link (county-level entries with no lower level) give an empty list and are skipped
            if ad_:
                ad_code = ad_[0]
                ad_name = ad_[1]
                town_url = response.urljoin(town_href)
                # print(response.meta['province_name'], response.meta['city_name'], response.meta['city_code'],
                #       ad_name, ad_code, town_url)
                yield scrapy.Request(url=town_url, callback=self.parse_town,
                                     meta={'province_name': response.meta['province_name'],
                                           'city_code': response.meta['city_code'],
                                           'city_name': response.meta['city_name'],
                                           'ad_code': ad_code,
                                           'ad_name': ad_name})
    def parse_town(self, response):
        towns = response.xpath("//tr[@class='towntr']")
        for town in towns:
            town_ = town.xpath('td/a/text()').extract()
            community_href = town.xpath('td/a/@href').extract_first()
            if town_:
                town_code = town_[0]
                town_name = town_[1]
                community_url = response.urljoin(community_href)
                # print(response.meta['province_name'], response.meta['city_name'], response.meta['city_code'],
                #       response.meta['ad_name'], response.meta['ad_code'],
                #       town_code, town_name, community_url)
                yield scrapy.Request(url=community_url, callback=self.parse_community,
                                     meta={'province_name': response.meta['province_name'],
                                           'city_code': response.meta['city_code'],
                                           'city_name': response.meta['city_name'],
                                           'ad_code': response.meta['ad_code'],
                                           'ad_name': response.meta['ad_name'],
                                           'town_code': town_code,
                                           'town_name': town_name})
    def parse_community(self, response):
        communitys = response.xpath("//tr[@class='villagetr']")
        for community in communitys:
            # villagetr rows are plain cells with no links: code, classification code, name
            community_ = community.xpath('td/text()').extract()
            if community_:
                community_code = community_[0]
                community_kind = community_[1]
                community_name = community_[2]
                print(response.meta['province_name'], response.meta['city_code'], response.meta['city_name'],
                      response.meta['ad_code'], response.meta['ad_name'],
                      response.meta['town_code'], response.meta['town_name'],
                      community_code, community_kind, community_name)
                statsItem = StatsItem()
                statsItem['prov_name'] = response.meta['province_name']
                statsItem['city_code'] = response.meta['city_code']
                statsItem['city_name'] = response.meta['city_name']
                statsItem['ad_code'] = response.meta['ad_code']
                statsItem['ad_name'] = response.meta['ad_name']
                statsItem['town_code'] = response.meta['town_code']
                statsItem['town_name'] = response.meta['town_name']
                statsItem['community_code'] = community_code
                statsItem['community_kind'] = community_kind
                statsItem['community_name'] = community_name
                yield statsItem
pipelines.py
import json


class MyspiderPipeline:
    def process_item(self, item, spider):
        return item


class StatsPipeline:
    def __init__(self):
        # write one JSON object per line; utf-8 so the Chinese names survive intact
        self.file = open("a.json", 'w', encoding='utf-8')

    def process_item(self, item, spider):
        print('StatsPipeline:', item)
        self.file.write(json.dumps(dict(item), ensure_ascii=False))
        self.file.write("\n")
        return item  # return the item so any later pipelines still receive it

    def close_spider(self, spider):
        # Scrapy calls this when the spider finishes; more reliable than __del__
        self.file.close()
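As an aside, the same one-object-per-line output can also be produced without a custom pipeline by using Scrapy's built-in feed exports. A minimal sketch for settings.py, assuming a Scrapy version recent enough to support the FEEDS setting:

# Alternative to StatsPipeline: let Scrapy's feed export write the items itself.
FEEDS = {
    'a.json': {
        'format': 'jsonlines',   # one JSON object per line
        'encoding': 'utf8',      # keep Chinese characters readable
    },
}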
items.py
import scrapy


class MyspiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class StatsItem(scrapy.Item):
    # one record per village/community, carrying the codes and names of all parent levels
    prov_name = scrapy.Field()
    city_code = scrapy.Field()
    city_name = scrapy.Field()
    ad_code = scrapy.Field()
    ad_name = scrapy.Field()
    town_code = scrapy.Field()
    town_name = scrapy.Field()
    community_code = scrapy.Field()
    community_kind = scrapy.Field()
    community_name = scrapy.Field()
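With the settings, spider, pipeline, and items in place, the crawl is started from the project directory; StatsPipeline then writes one JSON object per line into a.json in the directory the command is run from:

scrapy crawl stats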