python爬虫笔记

python爬虫笔记

Scroll Down
小提示,本文编写于  2,108  天前,最后编辑于  1,760  天前,某些信息可能有些出入,仅供参考。

python爬虫笔记

最近有个需求,项目组长安排我弄两份国家行政区划的数据文件,excel或者sql文件都行,一份中国的一份南非的,这可把我给整蒙了,中国的还好,爬一下国家统计局行政区划2019年的官网就好了,南非的从哪整去。

话不多说,找了很多资料,发现要么不全,要么不是最新,他们的数据肯定也是自己弄来的,我为啥就不能自己也爬一波,说干就干,先来爬中国行政区划统计数据:

首先是国家统计局官网:
我们要爬取的地址就是介个,

http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/

我们先琢磨下这个网站是怎么布局的:

11314

通过分析首页源码我们可以得到如下3点:

页面的整个布局是用table标签来控制的,也就是说,如果我们要通过beautifulsoup来抓取信息,一定要注意:上图中并不是只有标注了省市地区的地方才是表格,整个页面中存在多个表格,因此不能直接通过table标签来定位数据,而要通过数据行tr的class属性(省级页面为provincetr)来选取,如下面的代码所示:

    print('开始抓取省份信息……')
    province_url = url + index_href
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    request = requests.get(province_url, headers=headers)
    request.encoding = 'gbk'
    province_html_text = str(request.text)
    soup = BeautifulSoup(province_html_text, "html.parser")
    province_tr_list = soup.select('.provincetr a')

再次我们再看一下一般的数据页面(一般的数据页面包括市级、县级、镇级这三级数据展示页面):

  之所以要把上述三个页面放在一起,是因为通过分析我们可以发现,这三级数据的数据页面完全一致,唯一不同的就是在html源码数据表格中的数据行tr的class属性不一致,分别对应为:citytr、countytr和towntr。其他均一致。这样我们就可以用一个通用的方法解决这三个页面的数据爬取。
111926114
遍历市级别的信息:

 city_url = url + province_href
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    request = requests.get(city_url, headers=headers)
    request.encoding = 'gbk'
    city_html_text = str(request.text)
    soup = BeautifulSoup(city_html_text, "html.parser")
    city_tr_list = soup.select('.citytr')
    # 遍历市级城市列表信息

然后遍历市级列表,逐条写入后继续抓取区级别的信息:

    for city_tr in city_tr_list:
        if city_tr:
            file = open('mysql_v2/area.sql', 'a+', encoding='utf-8')
            city_a_info = city_tr.select('a')
            city_href = city_a_info[0].attrs['href']
            city_code = city_a_info[0].text[:6]
            city_name = city_a_info[1].text
            city_info = 'INSERT INTO area (code, name, parent_code, level) VALUES ("' + str(city_code) + '", "' + str(city_name) + '", "' + str(province_code) + '", "' + str(level) + '");\n'
            file.write(city_info)
            file.close()
            print('已写入市级:', city_info)
            # 区级
            get_area(city_href, city_code)

数据保存为sql格式,就只需要导入数据库就OK了~

province_info = 'INSERT INTO area (code, name, parent_code, level) VALUES ("' + str(province_code) + '", "' + str(province_name) + '", "' + str(parent_code) + '", "' + str(level) + '");\n'

city_info = 'INSERT INTO area (code, name, parent_code, level) VALUES ("' + str(city_code) + '", "' + str(city_name) + '", "' + str(province_code) + '", "' + str(level) + '");\n'

area_info = 'INSERT INTO area (code, name, parent_code, level) VALUES ("' + str(area_code) + '", "' + str(area_name) + '", "' + str(city_code) + '", "' + str(level) + '");\n'

最后附上完整的代码:

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
爬取国家统计局最新地址库
省市区三级(一张表)
author: icefire
time: 2019-03-13
"""

import requests
from bs4 import BeautifulSoup
import os


def get_province(index_href):
    """Scrape the province index page and emit one INSERT row per province.

    Downloads ``url + index_href`` (``url`` is a module-level global that is
    only assigned in the ``__main__`` section of this script), selects every
    link inside a ``.provincetr`` row, appends a level-1 INSERT statement to
    ``china_data/area.sql`` and then calls :func:`get_city` for that province.

    Args:
        index_href: Path of the index page, relative to ``url``.
    """
    print('开始抓取省份信息……')
    province_url = url + index_href
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    response = requests.get(province_url, headers=headers)
    # Decode the body as GBK before reading ``.text``.
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, "html.parser")
    province_tr_list = soup.select('.provincetr a')

    # Provinces are the top level, so they have no parent code.
    level = '1'
    parent_code = ''
    row_template = 'INSERT INTO area (code, name, parent_code, level) VALUES ("{}", "{}", "{}", "{}");\n'
    for province_tr in province_tr_list:
        province_href = province_tr.attrs['href']
        # The part of the href before the first '.' is padded with '0000'
        # to form the province code.
        province_no = province_href.split('.')[0]
        province_code = province_no + '0000'
        province_name = province_tr.text
        province_info = row_template.format(province_code, province_name, parent_code, level)
        # The file is closed again before descending: get_city() appends to
        # the same file through its own handle, so closing here keeps the
        # province row ahead of its city rows. ``with`` also guarantees the
        # handle is released if the write fails.
        with open('china_data/area.sql', 'a+', encoding='utf-8') as sql_file:
            sql_file.write(province_info)
        print('已写入省级:', province_info)
        # City level
        get_city(province_href, province_code)
    print('抓取省份信息结束!')


def get_city(province_href, province_code):
    """Scrape one province page and emit one INSERT row per city.

    Downloads ``url + province_href`` (``url`` is the module-level global),
    walks every ``.citytr`` row, appends a level-2 INSERT statement to
    ``china_data/area.sql`` and then calls :func:`get_area` for that city.

    Args:
        province_href: Path of the province page, relative to ``url``.
        province_code: Code of the parent province, written as ``parent_code``.

    Raises:
        IndexError: If a ``.citytr`` row contains fewer than two links.
    """
    print('开始抓取市级信息')
    city_url = url + province_href
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    response = requests.get(city_url, headers=headers)
    # Decode the body as GBK before reading ``.text``.
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, "html.parser")
    city_tr_list = soup.select('.citytr')

    level = '2'
    row_template = 'INSERT INTO area (code, name, parent_code, level) VALUES ("{}", "{}", "{}", "{}");\n'
    for city_tr in city_tr_list:
        # First link carries the code and the href of the next level,
        # second link carries the name.
        city_a_info = city_tr.select('a')
        city_href = city_a_info[0].attrs['href']
        # Only the first six characters of the code text are kept.
        city_code = city_a_info[0].text[:6]
        city_name = city_a_info[1].text
        city_info = row_template.format(city_code, city_name, province_code, level)
        # Open only once the row is fully parsed, and close before
        # descending: get_area() appends to the same file through its own
        # handle, so this keeps the city row ahead of its district rows.
        with open('china_data/area.sql', 'a+', encoding='utf-8') as sql_file:
            sql_file.write(city_info)
        print('已写入市级:', city_info)
        # District level
        get_area(city_href, city_code)
    print('抓取市级城市结束!')


def get_area(city_href, city_code):
    """Scrape one city page and emit one INSERT row per district.

    Downloads ``url + city_href`` (``url`` is the module-level global), walks
    every ``.countytr`` row and appends a level-3 INSERT statement to
    ``china_data/area.sql`` for each row that has table cells.

    Args:
        city_href: Path of the city page, relative to ``url``.
        city_code: Code of the parent city, written as ``parent_code``.
    """
    print('开始抓取区级信息')
    area_url = url + city_href
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
    }
    response = requests.get(area_url, headers=headers)
    # Decode the body as GBK before reading ``.text``.
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text, "html.parser")
    area_tr_list = soup.select('.countytr')

    level = '3'
    row_template = 'INSERT INTO area (code, name, parent_code, level) VALUES ("{}", "{}", "{}", "{}");\n'
    # This is the deepest level, so one handle can stay open for the whole
    # loop; ``with`` closes it even if a row fails to parse.
    with open('china_data/area.sql', 'a+', encoding='utf-8') as sql_file:
        for area_tr in area_tr_list:
            # Cells are read from <td> rather than <a> here.
            area_a_info = area_tr.select('td')
            if not area_a_info:
                continue
            # Only the first six characters of the code text are kept.
            area_code = area_a_info[0].text[:6]
            area_name = area_a_info[1].text
            area_info = row_template.format(area_code, area_name, city_code, level)
            sql_file.write(area_info)
            print('已写入区级:', area_info)
        print('抓取区级信息结束!')


# Script entry point
if __name__ == "__main__":
    # Base URL read by get_province / get_city / get_area as a global.
    url = 'https://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/'
    # Output directory for the generated SQL file
    mysql_folder = 'china_data/'
    if os.path.exists(mysql_folder):
        # Directory is already there: empty the SQL file left by a previous run
        with open('china_data/area.sql', 'w', encoding='utf-8') as city_file:
            city_file.write('')
    else:
        os.makedirs(mysql_folder)
    get_province('index.html')


OK,搞定,至于南非的行政区划,再琢磨琢磨,之后找到解决方案了再补上。

使用豆瓣源 安装 插件库

pip install requests -i "https://pypi.doubanio.com/simple/"

爬取网易云音乐歌曲评论

实现有点曲折,直接贴代码实现了,代码中配有注释

#v2.0
import base64
import codecs
import sys

import json
import os
import requests
import Crypto
from Crypto.Cipher import AES


class Spider():
    """Download the comments of one NetEase Cloud Music song page by page.

    Each page is requested through the ``weapi`` comment endpoint (the POST
    fields are produced by ``WangYiYun``) and appended to
    ``comments/<idNum>.csv``.
    """

    def __init__(self, idNum):
        """Prepare the request headers and the comment URL for one song.

        Args:
            idNum: Song id as a string; it is concatenated into the URL.
        """
        # The User-Agent was copied from a browser; the original author notes
        # that the remaining request headers are optional.
        self.header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0',
                       'Referer': 'http://music.163.com/'}
        # Only the song id differs between comment URLs.
        self.url = 'https://music.163.com/weapi/v1/resource/comments/R_SO_4_'+idNum+'?csrf_token='

    def __get_jsons(self, url, page):
        """POST the encrypted paging parameters and return the response text.

        Args:
            url: Comment endpoint to post to.
            page: 1-based page number passed on to ``WangYiYun.get_params``.

        Returns:
            The response body as text.
        """
        music = WangYiYun()
        # The same random text is used for ``params`` and ``encSecKey``.
        text = music.create_random_16()
        params = music.get_params(text, page)
        encSecKey = music.get_encSEcKey(text)
        fromdata = {'params': params, 'encSecKey': encSecKey}
        response = requests.post(url, data=fromdata, headers=self.header)
        return response.text

    def json2list(self, jsons):
        """Parse a response body and keep the interesting fields per comment.

        Args:
            jsons: JSON text containing a top-level ``comments`` list.

        Returns:
            A list of dicts with the keys ``userid``, ``name``, ``content``
            and ``times``, in the order of the ``comments`` list.

        Raises:
            KeyError: If ``comments`` or one of the extracted fields is absent.
        """
        users = json.loads(jsons)
        return [
            {
                'userid': user['user']['userId'],
                'name': user['user']['nickname'],
                'content': user['content'],
                'times': user['time'],
            }
            for user in users['comments']
        ]

    def run(self, idNum):
        """Fetch every comment page of the song and append it to the output file.

        Stops after the first page that holds fewer than 100 comments.

        Args:
            idNum: Song id as a string; used for the file name and written
                into the output.
        """
        self.page = 1
        # Replaces every code point above U+FFFF with U+FFFD. The table has
        # about a million entries, so it is built once rather than per page.
        non_bmp_map = dict.fromkeys(range(0x10000, sys.maxunicode + 1), 0xfffd)
        # Output goes to a "comments" folder under the current working directory.
        dirName = 'comments'
        os.makedirs(dirName, exist_ok=True)
        # os.path.join keeps the path portable; a hard-coded backslash path
        # only addresses the folder on Windows.
        out_path = os.path.join(dirName, idNum + '.csv')
        while True:
            jsons = self.__get_jsons(self.url, self.page)
            comments = self.json2list(jsons)

            print('self.page = '+str(self.page))  # page currently being fetched
            print(idNum)  # song id currently being fetched
            with open(out_path, "a", encoding='utf-8') as f:
                for comment in comments:
                    f.write(str(comment).translate(non_bmp_map))
                    f.write('\n')
                    # NOTE(review): the id is written after the newline, so it
                    # ends up glued to the front of the next record. Kept
                    # as-is to preserve the existing file layout - confirm
                    # whether that is intended.
                    f.write(idNum)
            # Pages are requested with limit 100, so a shorter page is the last one.
            if len(comments) < 100:
                print('评论已经获取完')
                break
            self.page += 1

# 找出post的两个参数params和encSecKey
class WangYiYun():
    """Build the two POST fields, ``params`` and ``encSecKey``, for the comment API."""

    def __init__(self):
        """Store the three fixed parameters obtained from NetEase Cloud Music."""
        # Used as the exponent in rsaEncrypt (see get_encSEcKey).
        self.second_param = '010001'
        # Used as the modulus in rsaEncrypt (see get_encSEcKey).
        self.third_param = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
        # Key of the first AES pass in get_params.
        self.fourth_param = '0CoJUm6Qyw8W8jud'

    def create_random_16(self):
        """Return a random string of 16 hexadecimal characters.

        Eight random bytes are hex-encoded directly. Hex-encoding the
        characters of ``str(os.urandom(16))`` instead would encode the
        ``b'`` prefix of the bytes repr, making the first four characters
        constant.
        """
        return os.urandom(8).hex()

    def aesEncrypt(self, text, key):
        """AES-CBC encrypt ``text`` with ``key`` and return it base64-encoded.

        Args:
            text: Plain text (``str``) to encrypt.
            key: AES key as ``str``; it is UTF-8 encoded before use.

        Returns:
            The base64-encoded ciphertext as ``bytes``.
        """
        # Fixed initialisation vector.
        iv = '0102030405060708'
        # Pad on the encoded bytes, not on the characters, so the padded
        # length is a multiple of 16 even when ``text`` is not pure ASCII.
        # PKCS#7-style: ``pad`` bytes, each with the value ``pad`` (1..16).
        data = text.encode('utf-8')
        pad = 16 - len(data) % 16
        data += bytes([pad]) * pad

        encryptor = AES.new(key.encode('utf-8'), AES.MODE_CBC, iv.encode('utf-8'))
        ciphertext = encryptor.encrypt(data)
        return base64.b64encode(ciphertext)

    def get_params(self, text, page):
        """Build the ``params`` field for one page of comments.

        The request description is stored in ``self.first_param`` and then
        encrypted twice: first with ``self.fourth_param``, then with ``text``.

        Args:
            text: Random 16-character key from ``create_random_16``.
            page: 1-based page number.

        Returns:
            The doubly encrypted, base64-encoded ``params`` value (``bytes``).
        """
        if page == 1:
            # The original author notes that ``rid`` may be left empty
            # (e.g. instead of "R_SO_4_557581284") without changing the result.
            self.first_param = '{rid: "", offset: "0", total: "true", limit: "100", csrf_token: ""}'
        else:
            # The offset must advance by the page size (limit = 100);
            # a smaller step makes consecutive pages overlap.
            offset = (page - 1) * 100
            self.first_param = '{rid: "", offset:%s, total: "false", limit: "100", csrf_token: ""}' % str(offset)

        params = self.aesEncrypt(self.first_param, self.fourth_param).decode('utf-8')
        params = self.aesEncrypt(params, text)

        return params

    def rsaEncrypt(self, pubKey, text, modulus):
        """Textbook RSA on the reversed text, returned as 256 hex characters.

        Args:
            pubKey: Exponent as a hexadecimal string.
            text: Text to encrypt; it is reversed before being converted.
            modulus: Modulus as a hexadecimal string.

        Returns:
            The result in lowercase hex, left-padded with zeros to 256 characters.
        """
        text = text[::-1]
        base = int(codecs.encode(text.encode('utf-8'), 'hex_codec'), 16)
        # Three-argument pow reduces while exponentiating instead of building
        # the full power first; the result is the same.
        rs = pow(base, int(pubKey, 16), int(modulus, 16))
        return format(rs, 'x').zfill(256)

    def get_encSEcKey(self, text):
        """Build the ``encSecKey`` field by RSA-encrypting ``text``.

        Args:
            text: The same random key that was passed to ``get_params``.

        Returns:
            A 256-character hexadecimal string.
        """
        pubKey = self.second_param
        moudulus = self.third_param
        encSecKey = self.rsaEncrypt(pubKey, text, moudulus)
        return encSecKey

def main():
    """Run one Spider per song id in the hard-coded list."""
    # 花粥 "纸短情长" and Zedd / Jon Bellion "Beautiful Now". Replace these with
    # ids looked up on NetEase Cloud Music as needed; each extra entry adds
    # one more crawl.
    idPs = ['557581284','32019002']
    for idNum in idPs:
        # One Spider instance per song; run() pages through its comments.
        spider = Spider(idNum)
        spider.run(idNum)


if __name__ == '__main__':
    main()