爬取一个网站的国家、省份和城市数据(Python 2 脚本)

# coding: utf-8

import requests
import json
import time
import sys
# Python 2-only workaround: make 'utf8' the interpreter-wide default encoding
# used for implicit str/unicode conversions. Neither the reload() builtin nor
# sys.setdefaultencoding exists on Python 3.
reload(sys)
sys.setdefaultencoding('utf8')

def go():
    """Crawl the country/province/city tree and save it to ``regions.csv``.

    Fetches the country list, then calls ``getRegion`` with type 2 for each
    country's provinces and with type 3 for each province's cities. One line
    is emitted per deepest level reached -- ``country,province,city``,
    ``country,province`` or just ``country`` -- each terminated by CRLF.
    Progress is printed while crawling.

    Request headers come from the module-level ``headers`` dict.

    Returns:
        True when the file was written, False when any step raised (the
        error is printed).
    """
    countryUrl = 'https://www.shein.com/user/addressbook/getCountry?get_all_country=1'
    try:
        countryResult = requests.get(countryUrl, headers=headers, timeout=30)
        # Parse the JSON payload.
        res = json.loads(countryResult.content)
        # Collect rows in a list and join once instead of growing a string
        # with repeated +=.
        rows = []
        for item in res['info']['country']['item_cates']:
            country = item['country']
            print('country: ' + country)
            # Fetch the provinces of this country.
            provinceResult = getRegion(item['id'], 2)
            if not provinceResult:
                rows.append(country)
                continue
            for province in provinceResult:
                provinceName = province['name']
                print('province: ' + provinceName)
                # Fetch the cities of this province.
                cityResult = getRegion(province['id'], 3)
                if not cityResult:
                    rows.append(country + ',' + provinceName)
                    continue
                for city in cityResult:
                    print('city: ' + city['name'])
                    rows.append(country + ',' + provinceName + ',' + city['name'])
        result = ''.join(row + '\r\n' for row in rows)
        # Write the final result. The file is opened in binary mode, so encode
        # explicitly rather than relying on the interpreter's default
        # encoding; `with` closes the file even if the write fails.
        with open('regions.csv', 'wb') as fp:
            fp.write(result.encode('utf-8'))
        return True
    except Exception as error:
        # Exception (not BaseException) so Ctrl-C / SystemExit still propagate.
        print(error)
        return False
        
def getRegion(parentId, type):
    """Fetch the child regions of ``parentId`` from the address-book endpoint.

    Request headers come from the module-level ``headers`` dict.

    Args:
        parentId: Id of the parent region; sent as the ``parentId`` query
            parameter.
        type: Region level to request; callers in this file pass 2 for
            provinces and 3 for cities.

    Returns:
        The decoded ``info`` value of the JSON response, or False when the
        response carries no data (JSON ``null`` or the string ``'null'``) or
        when the request or parsing fails.
    """
    regionUrl = ('https://www.shein.com/user/addressbook/getAddressById?parentId='
                 + str(parentId) + '&type=' + str(type))
    try:
        regionResult = requests.get(regionUrl, headers=headers, timeout=30)
        # Parse the JSON payload.
        info = json.loads(regionResult.content)['info']
    except Exception as error:
        # Best effort: a failed lookup is reported on stderr and treated as
        # "no children". Exception (not BaseException) is caught so that
        # Ctrl-C can still interrupt a long crawl.
        sys.stderr.write('getRegion(%s, %s) failed: %s\n' % (parentId, type, error))
        return False
    # JSON null decodes to None, not the string 'null'; accept both as
    # "no data" so callers always get False in that case.
    if info is None or info == 'null':
        return False
    return info

if __name__ == '__main__':
    import os

    # The site redirects unauthenticated requests to its login page, so log in
    # with a browser first and copy that request's Cookie header into the
    # SHEIN_COOKIE environment variable. A session cookie is a credential and
    # must not be hard-coded in the source.
    cookie = os.environ.get('SHEIN_COOKIE', '')
    if not cookie:
        sys.exit('Set SHEIN_COOKIE to the Cookie header of a logged-in session.')
    # go() and getRegion() read this module-level name.
    headers = {'user-agent': 'my-app/0.0.1', 'cookie': cookie}
    # Report failure through the exit status instead of discarding it.
    sys.exit(0 if go() else 1)


(58) 给我点赞
留言(155)
匿名:哎呦不错哦!
匿名:哎呦不错哦!