Using Proxy IPs with a Python Crawler

A Python crawler goes through a cycle of crawling, getting blocked, and working around the blocks. Of course the site then tightens its restrictions and the crawler adapts again, in an endless game of cat and mouse. In the early stages, adding headers and proxy IPs solves most problems.
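
To make the "add headers" part concrete before we get to proxies, here is a minimal sketch: the target URL is only an example, and the User-Agent string is the same desktop-Chrome one the scripts below use.

import requests

# A browser-like User-Agent; the default python-requests one is often blocked outright.
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
}
response = requests.get('https://book.douban.com/', headers=header, timeout=3)
print(response.status_code)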

 

When I was scraping Douban Books myself, my IP got banned outright because I made too many requests. That is what got me looking into proxy IPs.

 

(I had no idea what was going on at the time and nearly lost it...) Below I describe how I scrape data through proxy IPs; please point out anything that could be improved.

 

The problem:

 

My IP had been banned. Everything worked fine at first, so I assumed the problem was in my code.

 

After reading up online about proxy IPs for crawlers, I settled on the following approach:

 

1. Scrape a batch of proxy IPs and filter out the ones that don't work.
2. Pass a working IP to the proxies parameter of the requests call (see the sketch after this list).
3. Keep crawling.
4. Done.
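
Step 2 boils down to the proxies keyword argument of requests.get. A minimal sketch, using a placeholder proxy address (10.10.1.10:3128 is made up; swap in an address:port pair scraped and verified by the code below):

import requests

# Placeholder proxy address; replace it with a real address:port pair from ip.txt.
ip_url_next = '://10.10.1.10:3128'
proxies = {'http': 'http' + ip_url_next, 'https': 'https' + ip_url_next}
response = requests.get('https://book.douban.com/', proxies=proxies, timeout=3)
print(response.status_code)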

OK, that's all just talk; everyone knows the theory, so let's get straight to the code below...

With the plan in place, time to get to work.

 

Environment

Python 3.7, PyCharm

 

You will need to set this environment up yourself...

 

Preparation

A site to scrape proxy IPs from (domestic high-anonymity proxies)

A site to verify the IPs against

The Python crawler script that got your IP banned in the first place...

Pick the sites above according to your own needs.

 

Complete code for scraping proxy IPs

PS: It simply uses bs4 to pull out the IPs and port numbers, nothing tricky; it also includes logic to filter out unusable IPs.

 

The key parts are commented.

 

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time    : 2018/11/22
# @Author  : liangk
# @Site    :
# @File    : auto_archive_ios.py
# @Software: PyCharm


import requests
from bs4 import BeautifulSoup
import json


class GetIp(object):
    """Scrape proxy IPs."""

    def __init__(self):
        """Initialize variables."""
        self.url = 'http://www.xicidaili.com/nn/'
        self.check_url = 'https://www.ip.cn/'
        self.ip_list = []

    @staticmethod
    def get_html(url):
        """Fetch the HTML of a page."""
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        }
        try:
            request = requests.get(url=url, headers=header)
            request.encoding = 'utf-8'
            html = request.text
            return html
        except Exception:
            return ''

    def get_available_ip(self, ip_address, ip_port):
        """Check whether a proxy IP is usable."""
        header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
        }
        ip_url_next = '://' + ip_address + ':' + ip_port
        proxies = {'http': 'http' + ip_url_next, 'https': 'https' + ip_url_next}
        try:
            r = requests.get(self.check_url, headers=header, proxies=proxies, timeout=3)
            html = r.text
        except Exception:
            print('fail-%s' % ip_address)
        else:
            print('success-%s' % ip_address)
            soup = BeautifulSoup(html, 'lxml')
            div = soup.find(class_='well')
            if div:
                print(div.text)
            ip_info = {'address': ip_address, 'port': ip_port}
            self.ip_list.append(ip_info)

    def main(self):
        """Main method."""
        web_html = self.get_html(self.url)
        soup = BeautifulSoup(web_html, 'lxml')
        ip_list = soup.find(id='ip_list').find_all('tr')
        for ip_info in ip_list:
            td_list = ip_info.find_all('td')
            if len(td_list) > 0:
                ip_address = td_list[1].text
                ip_port = td_list[2].text
                # Check whether this proxy IP actually works
                self.get_available_ip(ip_address, ip_port)
        # Write the usable IPs to a file
        with open('ip.txt', 'w') as file:
            json.dump(self.ip_list, file)
        print(self.ip_list)


# Program entry point
if __name__ == '__main__':
    get_ip = GetIp()
    get_ip.main()
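
After a run, ip.txt holds the surviving proxies as a JSON list of {'address': ..., 'port': ...} entries. A quick sanity check before moving on (just a sketch that reads the file the script above writes):

import json

# Count and list the proxies that passed the availability check
with open('ip.txt', 'r') as file:
    ip_list = json.load(file)
print('usable proxies: %d' % len(ip_list))
for ip_info in ip_list:
    print(ip_info['address'] + ':' + ip_info['port'])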

Complete code for using the proxies

PS: The crawler picks a random proxy IP for each request and uses request_status to decide whether that IP is still usable.

 

Why check again like this?

 

Mainly because even though the IPs were filtered above, that does not guarantee they will still work at the moment you crawl, so an extra check is needed.

 

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Time    : 2018/11/22
# @Author  : liangk
# @Site    :
# @File    : get_douban_books.py
# @Software: PyCharm

from bs4 import BeautifulSoup
import datetime
import requests
import json
import random

ip_random = -1
article_tag_list = []
article_type_list = []


def get_html(url):
    header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
    }
    global ip_random
    ip_rand, proxies = get_proxie(ip_random)
    print(proxies)
    try:
        request = requests.get(url=url, headers=header, proxies=proxies, timeout=3)
    except Exception:
        request_status = 500
    else:
        request_status = request.status_code
    print(request_status)
    # Keep drawing a new random proxy until a request succeeds
    while request_status != 200:
        ip_random = -1
        ip_rand, proxies = get_proxie(ip_random)
        print(proxies)
        try:
            request = requests.get(url=url, headers=header, proxies=proxies, timeout=3)
        except Exception:
            request_status = 500
        else:
            request_status = request.status_code
        print(request_status)
    # Remember the proxy that worked and reuse it next time
    ip_random = ip_rand
    # Pass the raw bytes to BeautifulSoup, which detects the page encoding itself
    html = request.content
    print(html)
    return html


def get_proxie(random_number):
    """Pick a proxy from ip.txt; -1 means choose one at random."""
    with open('ip.txt', 'r') as file:
        ip_list = json.load(file)
        if random_number == -1:
            random_number = random.randint(0, len(ip_list) - 1)
        ip_info = ip_list[random_number]
        ip_url_next = '://' + ip_info['address'] + ':' + ip_info['port']
        proxies = {'http': 'http' + ip_url_next, 'https': 'https' + ip_url_next}
        return random_number, proxies


# Program entry point
if __name__ == '__main__':
    """Only crawls the first pages of each book tag, sorted by rating."""
    start_time = datetime.datetime.now()
    url = 'https://book.douban.com/tag/?view=type&icn=index-sorttags-all'
    base_url = 'https://book.douban.com/tag/'
    html = get_html(url)
    soup = BeautifulSoup(html, 'lxml')
    article_tag_list = soup.find_all(class_='tag-content-wrapper')
    tagCol_list = soup.find_all(class_='tagCol')

    for table in tagCol_list:
        """Collect the tag names from each tag table."""
        sub_type_list = []
        a = table.find_all('a')
        for book_type in a:
            sub_type_list.append(book_type.text)
        article_type_list.append(sub_type_list)

    for sub in article_type_list:
        for sub1 in sub:
            title = '==============' + sub1 + '=============='
            print(title)
            print(base_url + sub1 + '?start=0' + '&type=S')
            with open('book.text', 'a', encoding='utf-8') as f:
                f.write('\n' + title + '\n')
                f.write(base_url + sub1 + '?start=0' + '&type=S' + '\n')
            for start in range(0, 2):
                # Pagination offsets go 0, 20, 40, ... (start * 20)
                # type=S sorts by rating
                url = base_url + sub1 + '?start=%s' % (start * 20) + '&type=S'
                html = get_html(url)
                soup = BeautifulSoup(html, 'lxml')
                li = soup.find_all(class_='subject-item')
                for div in li:
                    info = div.find(class_='info').find('a')
                    img = div.find(class_='pic').find('img')
                    content = 'Title: <%s>' % info['title'] + '  Cover image: ' + img['src'] + '\n'
                    print(content)
                    with open('book.text', 'a', encoding='utf-8') as f:
                        f.write(content)

    end_time = datetime.datetime.now()
    print('Elapsed seconds: ', (end_time - start_time).seconds)
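
One design note: the while loop in get_html keeps drawing new random proxies until one returns 200, so if every entry in ip.txt has gone stale the script never finishes. A possible variation, sketched below, caps the number of attempts instead; the helper name fetch_with_proxies and the max_attempts parameter are my own, not part of the scripts above.

import json
import random
import requests


def fetch_with_proxies(url, max_attempts=5):
    """Try up to max_attempts random proxies from ip.txt, then give up."""
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'
    }
    with open('ip.txt', 'r') as file:
        ip_list = json.load(file)
    for _ in range(max_attempts):
        ip_info = random.choice(ip_list)
        ip_url_next = '://' + ip_info['address'] + ':' + ip_info['port']
        proxies = {'http': 'http' + ip_url_next, 'https': 'https' + ip_url_next}
        try:
            response = requests.get(url, headers=header, proxies=proxies, timeout=3)
        except requests.RequestException:
            continue
        if response.status_code == 200:
            return response.content
    return None  # every attempt failed; regenerate ip.txt and try again

If it returns None, rerunning the GetIp script to refresh ip.txt is usually the quickest fix.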

Why choose domestic high-anonymity proxies? Because simple proxy IPs like these are usually enough to cope with getting banned mid-crawl, and since your own IP is never exposed, it is indirectly protected.