urllib

Basic usage

urllib is part of the Python standard library, so there is no need to pip install it.

  • Basic test
    import urllib.request

    # Define the URL to visit
    url = 'http://www.baidu.com'

    # Send the request, simulating a browser
    response = urllib.request.urlopen(url)

    # Check the status code to see whether the request succeeded
    # print(response.getcode())  # 200

    # print(response)  # <http.client.HTTPResponse object at 0x0000017DC4629A30>
    # Get the response source; read() returns bytes, which need to be decoded
    # content = response.read().decode('utf-8')
    # Read line by line
    content = response.readlines()
    print(content)  # prints the page source
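  • Side note: a minimal sketch of the other HTTPResponse methods mentioned above (getcode, geturl, getheaders, read/readline/readlines), run against the same Baidu URL.
    import urllib.request

    response = urllib.request.urlopen('http://www.baidu.com')

    print(response.getcode())     # status code, e.g. 200
    print(response.geturl())      # the URL that was actually fetched
    print(response.getheaders())  # list of (header, value) tuples

    # read() consumes the body, so use only one of these per response
    content = response.read().decode('utf-8')   # whole body as a str
    # line = response.readline()                # one line of bytes
    # lines = response.readlines()              # list of byte lines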
  • However, an https site may still raise an error; adding a UA (User-Agent) header lets it be read normally.
    import urllib.request
    import urllib.parse

    url_page = 'https://www.baidu.com/s?'

    # Simulate a real browser request
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
    }

    # URL-encode the Chinese query parameters
    data = {
        'wd': '周杰伦',
        'gender': '男',
        'location': '中国台湾'
    }
    params = urllib.parse.urlencode(data)

    # Create the request object and attach the user-agent header
    request = urllib.request.Request(url=url_page + params, headers=headers)
    response = urllib.request.urlopen(request)
    content = response.read().decode('utf-8')

    print(content)  # prints the page source
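  • Side note: urllib.parse also has quote() for encoding a single value, while urlencode() handles a whole dict; a minimal sketch below reuses the same example keyword.
    import urllib.parse

    # quote() percent-encodes one value; handy when building the URL by hand
    single = urllib.parse.quote('周杰伦')
    url_single = 'https://www.baidu.com/s?wd=' + single

    # urlencode() turns a whole dict into key=value&key=value form
    params = urllib.parse.urlencode({'wd': '周杰伦', 'gender': '男'})
    url_multi = 'https://www.baidu.com/s?' + params

    print(url_single)
    print(url_multi)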
  • Saving images, audio and video files
    import urllib.request

    # Define the URL to download
    # url_page = 'http://www.baidu.com'
    url_img = 'https://img1.baidu.com/it/u=2835220188,4227150300&fm=253&fmt=auto&app=138&f=JPEG?w=500&h=585'
    # url_video = 'http://www.baidu.com'
    # Download a resource; the second argument is the file path
    # urllib.request.urlretrieve(url_page, './raw/baidu.html')

    # Image resource
    urllib.request.urlretrieve(url_img, './raw/lisa.jpg')
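  • Side note: the same image can be saved without urlretrieve by reading the bytes from urlopen and writing them in binary mode; a minimal sketch, with './raw/lisa2.jpg' as an illustrative output path.
    import urllib.request

    url_img = 'https://img1.baidu.com/it/u=2835220188,4227150300&fm=253&fmt=auto&app=138&f=JPEG?w=500&h=585'

    # urlopen gives the raw bytes of the response body
    data = urllib.request.urlopen(url_img).read()

    # write the bytes to disk in binary mode ('wb')
    with open('./raw/lisa2.jpg', 'wb') as f:   # illustrative path
        f.write(data)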
  • Using a proxy IP pool: handler, build_opener and open replace urlopen and allow extra behaviour such as proxies.
    import urllib.request
    import random

    base_url = 'http://baidu.com/s?wd=ip'
    # Build the request

    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
        'Cookie': '__qc_wId=865; JSESSIONID=D06865AF09CD9D619CA1930C9EC055A6'
    }

    request = urllib.request.Request(url=base_url, headers=header)

    # Proxy IP pool
    proxies_pool = [
        {'http': '120.24.76.81:8123'},
        {'http': '120.24.76.81:8124'}
    ]

    # Pick a random proxy from the pool
    proxies = random.choice(proxies_pool)

    # handler / build_opener / open: replacing urlopen with these three allows more control

    handler = urllib.request.ProxyHandler(proxies=proxies)
    opener = urllib.request.build_opener(handler)
    response = opener.open(request)

    content = response.read().decode('utf-8')
    print(content)
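
    One way to extend the pool idea, sketched on the assumption that a proxy may be dead: catch URLError and retry with another random proxy (the limit of 3 attempts is arbitrary).

    import random
    import urllib.error
    import urllib.request

    proxies_pool = [
        {'http': '120.24.76.81:8123'},
        {'http': '120.24.76.81:8124'}
    ]
    request = urllib.request.Request(url='http://baidu.com/s?wd=ip',
                                     headers={'User-Agent': 'Mozilla/5.0'})

    content = None
    for attempt in range(3):  # illustrative retry limit
        proxies = random.choice(proxies_pool)
        opener = urllib.request.build_opener(urllib.request.ProxyHandler(proxies=proxies))
        try:
            content = opener.open(request, timeout=5).read().decode('utf-8')
            break
        except urllib.error.URLError as e:
            print('proxy failed, retrying:', e)

    print(content)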

    Comprehensive exercise (Baidu Translate)

    import urllib.request
    import urllib.parse
    import json

    base_url = 'https://fanyi.baidu.com/sug'  # simple translation
    # Detailed translation
    # base_url = 'https://fanyi.baidu.com/v2transapi?from=en&to=zh'

    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    }

    while True:
        english = input('Enter a word to translate: ')

        data = {
            'kw': english
        }
        data = urllib.parse.urlencode(data).encode('utf-8')

        # data is the form data of the interface; POST data must be encoded,
        # while GET parameters are concatenated onto the URL and need no encode()
        request = urllib.request.Request(base_url, data, headers)

        response = urllib.request.urlopen(request)

        content = response.read().decode('utf-8')

        # print(content)

        result = json.loads(content)
        # print(result)
        if len(result['data']) > 0:
            print(result['data'][0]['v'])
        else:
            print("I don't know")

    XPath

    The code above retrieves the whole page source, but often only one kind of element is needed (certain div or img tags, for example); install lxml and filter with XPath.
    pip install lxml

    from lxml import etree
    import urllib.request
    import random

    # XPath parsing: etree.parse for local files, etree.HTML for pages fetched from a server

    base_url = 'https://codert.cn/'

    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
    }

    request = urllib.request.Request(url=base_url, headers=header)
    handler = urllib.request.HTTPHandler()

    opener = urllib.request.build_opener(handler)

    response = opener.open(request)
    content = response.read().decode('utf-8')
    # print(content)

    # Basic XPath syntax
    '''
    1. Path queries
       //  find all descendant nodes, regardless of level
       /   find direct child nodes
    2. Predicate queries
       //div[@id]
       //div[@id="management"]
    3. Attribute queries
       //@class
    4. Fuzzy queries
       //div[contains(@id, "he")]
       //div[starts-with(@id, "he")]
    5. Content queries
       //div/h1/text()
    6. Logical operators
       //div[@id='head' and @class='s_down']
       //title | //price
    '''

    tree = etree.HTML(content)
    result_list = tree.xpath('//img/@data-lazy-src')  # the site's lazy-loaded image URLs
    # print(len(result_list))
    for temp in result_list:
        try:
            print(temp)
            urllib.request.urlretrieve(temp, './raw/douban/condert' + str(random.randint(1, 10000)) + '.jpg')
        except Exception:
            print('failed to download: ' + temp)
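
    As the comment above notes, etree.parse is for local files while etree.HTML is for server responses. A minimal sketch of the local-file case using a few of the listed queries; './raw/sample.html' is a hypothetical file.

    from lxml import etree

    # Local HTML is usually not well-formed XML, so pass an HTMLParser explicitly
    parser = etree.HTMLParser()
    tree = etree.parse('./raw/sample.html', parser)   # hypothetical local file

    # a few of the query forms listed above
    divs_with_id = tree.xpath('//div[@id]')    # predicate query
    classes = tree.xpath('//@class')           # attribute query
    headings = tree.xpath('//div/h1/text()')   # content query

    print(len(divs_with_id), classes, headings)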

    requests

    Install: pip install requests

    import requests
    from lxml import etree

    '''
    requests response attributes and types:
    r.text        : the page source as text
    r.encoding    : read or set the encoding
    r.url         : the URL of the response
    r.content     : the response body as bytes
    r.status_code : the response status code
    r.headers     : the response headers
    '''
    url = 'https://wish.zhangweishihundan.com/'

    response = requests.get(url)

    response.encoding = 'utf-8'
    content = response.text

    tree = etree.HTML(content)
    result_list = tree.xpath('//div[@class="sbody"]/text()')

    # Save the results to a file
    with open('./raw/zhangwei/wish.txt', 'w', encoding='utf-8') as fp:
        for temp in result_list:
            fp.write(temp + '\n')
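
    A quick sketch exercising the response attributes listed in the docstring above, against the same Baidu URL used earlier.

    import requests

    r = requests.get('http://www.baidu.com')
    r.encoding = 'utf-8'

    print(r.status_code)    # e.g. 200
    print(r.url)            # final URL of the response
    print(r.headers)        # response headers (case-insensitive dict)
    print(type(r.content))  # bytes
    print(r.text[:100])     # first 100 characters of the decoded body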
  • GET requests with requests: pass the query parameters via params; the values need no manual encoding.
    import requests

    url = 'https://www.baidu.com/s'

    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
    }

    data = {
        'wd': 'lisa'
    }
    # The parameter values do not need to be encoded manually
    response = requests.get(url=url, headers=header, params=data)
    response.encoding = 'utf-8'
    content = response.text
    print(content)  # prints the page source
  • POST requests with requests: pass the form data via data (Baidu Translate sug interface).
    import requests
    import json

    url = 'https://fanyi.baidu.com/sug'

    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
    }

    data = {
        'kw': 'love'
    }
    # Unlike GET requests, which take params, POST requests take their form data via data
    response = requests.post(url=url, data=data, headers=header)
    response.encoding = 'utf-8'
    content = response.text

    content = json.loads(content)
    print(content['data'])  # prints the translations of "love"
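
    For comparison with the urllib proxy example, the same idea in requests, which accepts a proxies dict directly; a sketch using the same illustrative proxy addresses as earlier (they may not be live).

    import random
    import requests

    proxies_pool = [
        {'http': 'http://120.24.76.81:8123'},
        {'http': 'http://120.24.76.81:8124'}
    ]
    header = {'User-Agent': 'Mozilla/5.0'}

    # requests takes the proxy mapping via the proxies argument
    proxies = random.choice(proxies_pool)
    response = requests.get('http://baidu.com/s?wd=ip',
                            headers=header, proxies=proxies, timeout=5)
    print(response.status_code)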

    scrapy

  • In the terminal, run scrapy startproject <project name> to create a new project

    • Create a spider file in the spiders folder

      In the terminal, cd into the project's spiders folder and run scrapy genspider <file name> <website>

      import scrapy


      class BaiduSpider(scrapy.Spider):
          # The spider's name: the value used when running the spider
          name = 'baidu'

          allowed_domains = ['www.baidu.com']
          start_urls = ['http://www.baidu.com/']

          # Called once the start_urls have been fetched; response is the returned object,
          # roughly the counterpart of what urllib.request.urlopen() returns
          def parse(self, response):
              pass
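
      As a sketch of what parse() might look like once filled in (the XPath query and log call below are illustrative, not part of the generated template), the scrapy response object exposes an xpath() selector API similar to lxml:

      import scrapy


      class BaiduSpider(scrapy.Spider):
          name = 'baidu'
          allowed_domains = ['www.baidu.com']
          start_urls = ['http://www.baidu.com/']

          def parse(self, response):
              # response.xpath() returns a SelectorList; get() extracts the first match as a str
              title = response.xpath('//title/text()').get()   # illustrative query
              self.log(title)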

    • Running the spider

      In the terminal, cd into the project's spiders folder and run scrapy crawl <spider name>

      Example

      Downloading League of Legends champion skin images

      """
      Date: 2022.11.7
      Author: panther
      Language: python3
      """
      import requests
      import re
      import json
      import os


      def getHtml(url):
          try:
              r = requests.get(url)
              r.raise_for_status()
              r.encoding = r.apparent_encoding
          except Exception:
              print(url + " failed to fetch!")
          else:
              response = r.text
              getInfo(response)


      def getInfo(res):
          # champion.js embeds a JSON block; pull out the "keys" mapping of hero ids
          lists = re.findall(r'"keys":(.*?),"data"', res)
          # print(lists)
          hero_id = json.loads(lists[0])
          # print(hero_id)
          for hero in hero_id.values():
              getSkin(hero)


      def getSkin(hero):
          url = 'https://lol.qq.com/biz/hero/' + hero + '.js'
          try:
              r = requests.get(url)
              r.raise_for_status()
              r.encoding = r.apparent_encoding
          except Exception:
              print(url + " failed to fetch!")
          else:
              html = r.text
              num = re.findall(r'"id":"(\d{4,6})","num"', html)
              for i in range(len(num)):
                  img_url = 'https://game.gtimg.cn/images/lol/act/img/skin/big' + num[i] + '.jpg'
                  save_img(hero, img_url)


      def save_img(hero, img_url):
          root = hero + "\\"
          path = root + img_url.split('/')[-1]
          try:
              if not os.path.exists(root):
                  os.mkdir(root)
              if not os.path.exists(path):
                  r = requests.get(img_url)
                  with open(path, 'wb') as f:
                      f.write(r.content)
                  print("file saved!")
              else:
                  print("file already exists!")
          except Exception:
              print("download failed!")
          print(img_url + " downloaded")


      def main():
          url = "https://lol.qq.com/biz/hero/champion.js"
          getHtml(url)


      if __name__ == "__main__":
          main()