urllib サンプル

Forbiddenに対応させたかった

HTTPステータスコードが 403 (Forbidden: 閲覧禁止) だった場合に、
User-Agent ヘッダーを偽装して再度アクセスするサンプル

#!/usr/bin/env python
# encoding: utf-8
import sys
from urllib.request import Request, urlopen
from urllib.error import URLError, HTTPError


def main():
    """Download the sample page and save its HTML as ``index.html``.

    The page is fetched with ``throw_request``, decoded to text with
    ``gen_html`` and written to the current directory with ``save_file``.
    """
    url = "https://gihyo.jp/dp"
    # To exercise the URLError path, point ``url`` at an unresolvable host
    # such as "http://not_exist_site/".

    # The response owns a network connection; the ``with`` block closes it
    # as soon as the body has been read instead of leaving it to the GC.
    with throw_request(url) as res:
        html = gen_html(res)
    save_file(html, "index.html")


def throw_request(url, req=None, recursive=False):
    """Open *url* and return the response, retrying once on HTTP 403.

    When the server answers 403 (Forbidden) and this call is not already
    the retry, the request is sent again with a browser-like
    ``User-Agent`` header.

    Args:
        url: Address to fetch.
        req: Prepared ``Request`` to send. When omitted, a plain
            ``Request(url)`` is built. Note that the 403 retry builds a
            fresh request from *url*, so it does not reuse *req*.
        recursive: True when this call is itself the retry; it prevents
            a second retry.

    Returns:
        The response object returned by ``urlopen``.

    Raises:
        SystemExit: with status 1, via ``sys.exit(1)``, for any
            ``HTTPError`` that is not retried and for any ``URLError``.
    """
    if req is None:
        req = Request(url)
    try:
        res = urlopen(req)
    except HTTPError as e:
        print('raise HTTPError')
        print('StatusCode: ' + str(e.code))
        print('ErrorReason: ' + str(e.reason))
        if e.code == 403 and not recursive:
            print("アクセスが禁止されています")
            print("ユーザーエージェントを偽装して再接続します")
            # HTTPError doubles as the failed response object; release it
            # before opening the second connection.
            e.close()
            req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
            # Retry exactly once: recursive=True stops further retries.
            return throw_request(url, req, True)
        print('正常なレスポンスでないため、エラー終了します')
        sys.exit(1)
    except URLError as e:
        # No HTTP response at all (e.g. unknown host, missing local file).
        print('raise URLError')
        print('ErrorReason: ' + str(e.reason))
        print('正常なレスポンスでないため、エラー終了します')
        sys.exit(1)
    else:
        print("request was successful")
        print('StatusCode: ' + str(res.getcode()))
        return res


def gen_html(res):
    """Read the response body and return it decoded as text.

    ``res.read()`` returns ``bytes``; it is decoded with the charset
    declared in the response headers, or UTF-8 when none is declared.
    The chosen charset name is printed.

    Args:
        res: Response object providing ``info()`` (the headers) and
            ``read()``.

    Returns:
        The body as ``str``. Bytes that are invalid for the charset are
        replaced with U+FFFD instead of raising ``UnicodeDecodeError``.
    """
    encoding = res.info().get_content_charset(failobj="utf-8")
    print("encoding: ", encoding)
    body = res.read()
    try:
        return body.decode(encoding, errors="replace")
    except LookupError:
        # The server declared a charset name Python has no codec for;
        # fall back to UTF-8 rather than aborting.
        return body.decode("utf-8", errors="replace")


def save_file(text, file_name, encoding="utf-8"):
    """Write *text* to *file_name*, replacing any existing file.

    Args:
        text: String to write.
        file_name: Path of the file to create or overwrite.
        encoding: Text encoding of the output file. It is given
            explicitly so the result does not depend on the platform's
            default encoding.
    """
    with open(file_name, 'w', encoding=encoding) as f:
        f.write(text)
    print("save: " + file_name)


# Run the sample only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

python urllib-sample4.py
raise HTTPError
StatusCode: 403
ErrorReason: Forbidden
アクセスが禁止されています
ユーザーエージェントを偽装して再接続します
request was successful
StatusCode: 200
encoding:  utf-8
save: index.html