爬取某网站写的python代码-编程学习网

代码如下：

"""Crawler for the product catalogue of http://www.kgtmall.com.cn/.

Usage:
    1. Install a Python 3 environment (plus ``requests`` and ``pyquery``).
    2. Run this script directly.
    3. When it finishes, ``result.csv`` next to this script contains one row
       per product found on the site.

Note:
    Do not open ``result.csv`` while the script is running; on some platforms
    that locks the file and the script can no longer append rows.  Open
    result.csv only after the run has completed.
"""
import csv
import datetime
import os
import re

import requests
from pyquery import PyQuery

# Seconds to wait for the home page before giving up instead of hanging.
_REQUEST_TIMEOUT = 30

# Extracts the category id from a category search link.  Raw string with the
# dots and the '?' escaped so they are matched literally.
_CATID_RE = re.compile(
    r'http://www\.kgtmall\.com\.cn/mall/search\.php\?catid=(.*)')


def _result_csv_path():
    """Return the absolute path of ``result.csv`` beside this script.

    ``__file__`` can be a bare relative file name; without ``abspath`` its
    ``dirname`` would be empty and the file would be targeted at ``/``.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(script_dir, 'result.csv')


def get_html_text(url):
    """Download *url* and return the response body as text.

    :param url: address of the page to fetch
    :return: decoded response text
    """
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    return response.text


def get_one_level_class(home_url):
    """Yield ``(url, title)`` for every first-level category on the home page.

    Example entries:
        母婴用品 http://www.kgtmall.com.cn/mall/list.php?catid=4
        生活家居 http://www.kgtmall.com.cn/mall/list.php?catid=5

    :param home_url: address of the site's home page
    """
    page = PyQuery(get_html_text(home_url))
    for node in page('.menu_title a'):
        link = PyQuery(node)('a')
        one_level_url = link.attr('href')
        if not one_level_url:
            # An anchor without href cannot be followed.
            continue
        yield one_level_url, link.text()


def get_two_level_class(home_url):
    """Yield ``(level1_title, level2_title, level2_url)`` for each sub-category.

    Example entries:
        母婴用品 营养辅食 http://www.kgtmall.com.cn/mall/search.php?catid=539
        母婴用品 妈妈专区 http://www.kgtmall.com.cn/mall/search.php?catid=544

    :param home_url: address of the site's home page
    """
    for one_level_url, one_level_title in get_one_level_class(home_url):
        # PyQuery fetches the page itself when given an http(s) URL.
        page = PyQuery(one_level_url)
        for node in page('.selector_category li'):
            link = PyQuery(node)('a')
            two_level_url = link.attr('href')
            if not two_level_url:
                continue
            yield one_level_title, link.text(), two_level_url


def get_pages(url):
    """Return the number of result pages of the listing at *url*.

    The count is parsed from the pagination text; if it cannot be parsed the
    listing is assumed to have a single page.

    :param url: address of a category listing
    :return: page count (at least 1 when parsing fails)
    """
    pagination_text = PyQuery(url)('.pagination cite').text()
    print('原pages：', pagination_text)
    try:
        pages = int(re.findall(r'共.*?条/(.*)页', pagination_text)[0])
    except (IndexError, ValueError) as e:
        # No pagination text, or the captured part is not a number.
        print(e)
        pages = 1
    print('页码：', pages)
    return pages


def get_three_level_class(home_url):
    """Yield one listing-page URL per page of every third-level category.

    Yields ``(level1_title, level2_title, level3_title, page_url)`` where
    ``page_url`` is the category search URL with ``order=10`` and an explicit
    page number.

    Example categories:
        母婴用品 营养辅食 DHA http://www.kgtmall.com.cn/mall/search.php?catid=548
        母婴用品 营养辅食 益生菌/初乳 http://www.kgtmall.com.cn/mall/search.php?catid=549

    :param home_url: address of the site's home page
    """
    for one_level_title, two_level_title, two_level_url in get_two_level_class(home_url):
        page = PyQuery(two_level_url)
        for node in page('.selector_category li'):
            link = PyQuery(node)('a')
            three_level_title = link.text()
            three_level_url = link.attr('href')
            match = _CATID_RE.search(three_level_url or '')
            if match is None:
                # Missing href or not a category link: nothing to paginate.
                continue
            catid = match.group(1)
            pages = get_pages(three_level_url)
            for index in range(1, pages + 1):
                three_level_url_by_xiaoliang = 'http://www.kgtmall.com.cn/mall/search.php?kw=&list=0&catid={}&order=10&minprice=&maxprice=&page={}'.format(
                    catid, index)
                yield one_level_title, two_level_title, three_level_title, three_level_url_by_xiaoliang


def shop_title_and_url(home_url):
    """Yield ``(level1, level2, level3, product_title, product_url)`` tuples.

    Example entry:
        母婴用品 营养辅食 DHA 澳洲直邮 澳大利亚RIFOLD 儿童DHA90粒（一月以上适用）
        http://www.kgtmall.com.cn/mall/show.php?itemid=28089

    :param home_url: address of the site's home page
    """
    for one_level_title, two_level_title, three_level_title, listing_url in get_three_level_class(home_url):
        page = PyQuery(listing_url)
        for node in page('.list_img a'):
            item = PyQuery(node)
            shop_url = item('a').attr('href')
            # The product title is taken from the thumbnail's alt text.
            shop_title = item('a img').attr('alt')
            yield one_level_title, two_level_title, three_level_title, shop_title, shop_url


def get_shop_info(home_url, count):
    """Scrape every product page and append one CSV row per product.

    :param home_url: address of the site's home page
    :param count: number printed for the first product; it increases by one
        for each following product
    """
    csv_file = _result_csv_path()
    products = shop_title_and_url(home_url)
    for count, (one_level_title, two_level_title, three_level_title,
                shop_title, shop_url) in enumerate(products, start=count):
        print('--排错：' + one_level_title, two_level_title, three_level_title, shop_title, shop_url)
        page = PyQuery(shop_url)
        price = page('.price').text()
        # Barcode
        bar_code = page('.bar_code dl dd p').text()
        detail_text = page('#content').text()
        try:
            guige = re.findall(r'规格：(.*)', detail_text)[0]
        except IndexError:
            # The description has no specification line.
            guige = '没有规格'
        try:
            chandi = re.findall(r'产地：(.*)', detail_text)[0]
        except IndexError:
            # The description has no place-of-origin line.
            chandi = '没有产地'
        print(count, one_level_title, two_level_title, three_level_title, shop_title, bar_code, chandi, guige, price, shop_url)
        row = [one_level_title, two_level_title, three_level_title, shop_title, bar_code, chandi, guige, price, shop_url]
        # Append per product so finished rows survive an interrupted run.
        # newline='' avoids blank lines between rows in the csv file;
        # the explicit encoding avoids charset errors when writing.
        with open(csv_file, 'a', newline='', encoding='utf-8') as f:
            csv.writer(f).writerow(row)


def main():
    """Create ``result.csv`` with a header row, run the crawl, report timing."""
    # Remember when the run started.
    start_time = datetime.datetime.now()
    home_url = 'http://www.kgtmall.com.cn/'
    csv_file = _result_csv_path()
    headers = ['一级分类', '二级分类', '三级分类', '商品名称', '条码', '产地', '规格', '价格', '商品链接']
    # Mode 'w' truncates any result left over from a previous run.
    with open(csv_file, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f).writerow(headers)
    count = 1
    get_shop_info(home_url, count)
    # Remember when the run ended and report the elapsed time.
    end_time = datetime.datetime.now()
    timediff = end_time - start_time
    # total_seconds() also counts whole days, unlike the .seconds attribute.
    print('总共用时{}秒\n'.format(int(timediff.total_seconds())))
    print('全部商品已经按需求完成！！！')


if __name__ == '__main__':
    main()

运行后，会在脚本所在目录下生成一个 result.csv 文件，内容如下：

爬取某网站写的python代码

文章详情

爬取某网站写的python代码

软考中级精品资料免费领

相关文章

猜你喜欢

爬取某网站写的python代码

Python爬虫爬取美剧网站的实现代码

python怎么爬取某网站图片

用python爬取某个图片网站的图片

python爬取某网站原图作为壁纸

Python爬虫实战之用selenium爬取某旅游网站

python爬取网站数据（含代码和讲解）

python如何爬取某网站原图作为壁纸

Python轻松爬取写真网站全部图片

Python轻松爬取写真网站全部图片

Node.js实现爬取网站图片的示例代码

怎么用Python爬取某图网的图片

R语言怎样抓取某网站JSON数据的代码

python 爬取壁纸网站的示例

Python爬取网页的所有内外链的代码

利用Python网络爬虫爬取各大音乐评论的代码

Python爬虫之爬取最新更新的小说网站

利用Python爬虫爬取网站音乐遇到的坑

python爬取新闻门户网站的示例

利用 Python 爬取网站的新手指南