GitHub link for my hand-written python crawlers
1. Scraping the bilibili daily ranking
- Scrape with XPath and save the data to a csv file
- The csv filename is the time period the ranking covers
```python
import requests
import csv
import lxml.html

url = 'https://www.bilibili.com/ranking/'
html = requests.get(url).content.decode()
selector = lxml.html.fromstring(html)

# Pull title, link, uploader name and play count for every entry on the ranking page
title = selector.xpath('//*[@id="app"]/div[1]/div/div[1]/div[2]/div[3]/ul/li/div[2]/div[2]/a/text()')
link = selector.xpath('//*[@id="app"]/div[1]/div/div[1]/div[2]/div[3]/ul/li/div[2]/div[1]/a/@href')
up_name = selector.xpath('//*[@id="app"]/div[1]/div/div[1]/div[2]/div[3]/ul/li/div[2]/div[2]/div[1]/a/span/text()')
up_videoplay = selector.xpath('//*[@id="app"]/div[1]/div/div[1]/div[2]/div[3]/ul/li/div[2]/div[2]/div[1]/span[1]/text()')

# The page states which time span the ranking covers; strip the surrounding
# wording so only the date range remains, and use it as the csv filename
time = selector.xpath('//*[@id="app"]/div[1]/div/div[1]/div[2]/div[2]/div/span/text()')
time_num = time[0]
time_num2 = time_num.replace(' 的数据综合得分,每日更新一次', '').replace('统计所有投稿在 ', '')

headers = ['up_name', 'title', 'link']
rows = []
for i in range(100):  # the ranking has 100 entries
    rows.append([up_name[i], title[i], link[i]])

# newline='' keeps csv.writer from inserting blank lines on Windows
with open(f'{time_num2}.csv', 'w', encoding='utf-8', newline='') as f:
    f_csv = csv.writer(f)
    f_csv.writerow(headers)
    f_csv.writerows(rows)
```
- Partial view of the csv output (file: `2020年02月07日 - 2020年02月10日.csv`)
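A quick way to sanity-check the output is to read the file back; a minimal sketch, using the filename produced by the run above:

```python
import csv

# Read the saved ranking back and show the header plus the first few rows.
# The filename matches the date range scraped above.
with open('2020年02月07日 - 2020年02月10日.csv', encoding='utf-8') as f:
    for row in list(csv.reader(f))[:4]:
        print(row)  # ['up_name', 'title', 'link'], then one row per video
```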
2. Scraping images found via baidu search (basic)
2.1 thumbURL
```python
import re
import requests
import os

def download(html):
    # thumbURL entries are the thumbnail links embedded in the result page
    pic_url = re.findall('"thumbURL":"(.*?)",', html, re.S)
    i = 1
    for key in pic_url:
        print("开始下载图片:" + key + "\r\n")
        try:
            pic = requests.get(key, timeout=10)
        except requests.exceptions.ConnectionError:
            print('图片无法下载')
            continue
        main_path = "E:/baidu/"
        if not os.path.exists(main_path):
            os.makedirs(main_path)
        path = main_path + str(i) + '.jpg'
        with open(path, 'wb') as fp:
            fp.write(pic.content)
        i += 1

def main():
    url = 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&fm=result&pos=history&word=siyueshinide'
    result = requests.get(url)
    download(result.text)

if __name__ == '__main__':
    main()
```
2.2 objURL
_Higher resolution, but some images fail to download (see the header tweak after the code below)_
```python
import re
import requests
import os

def download(html):
    # objURL entries point at the original full-size images
    pic_url = re.findall('"objURL":"(.*?)",', html, re.S)
    i = 1
    for key in pic_url:
        print("开始下载图片:" + key + "\r\n")
        try:
            pic = requests.get(key, timeout=10)
        except requests.exceptions.ConnectionError:
            print('图片无法下载')
            continue
        except requests.exceptions.ReadTimeout:
            print('requests.exceptions.ReadTimeout')
            continue
        main_path = "E:/baidu/"
        if not os.path.exists(main_path):
            os.makedirs(main_path)
        path = main_path + str(i) + '.jpg'
        with open(path, 'wb') as fp:
            fp.write(pic.content)
        i += 1

def main():
    url = 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&fm=result&pos=history&word=siyueshinide'
    result = requests.get(url)
    download(result.text)

if __name__ == '__main__':
    main()
```
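A likely reason some objURL downloads fail is that the original image hosts reject requests that don't look like they come from a browser (anti-hotlinking). A minimal, unverified tweak is to send browser-like headers with the image request inside `download()`; the header values below are only illustrative:

```python
# Hedged tweak: send browser-like headers so fewer image hosts reject the request.
# Values are illustrative; different hosts check different things (some want a Referer).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Referer': 'https://image.baidu.com/',
}
pic = requests.get(key, headers=headers, timeout=10)
```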
2.3 baidu, object-oriented refactor
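Compared with 2.2, the search keyword is read from user input, each keyword gets its own folder, and fetching, parsing and saving are split into separate functions.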
```python
import requests
import re
import os

def get_id(search_id):
    # Build the search URL for the given keyword
    url = 'http://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=' + search_id
    return url

def get_obj():
    # Fetch the result page and pull out the original-image (objURL) links
    url = get_id(search_id)
    html = requests.get(url).content.decode()
    obj_URL = re.findall('"objURL":"(.*?)",', html, re.S)
    return obj_URL

def save_pic():
    obj_url = get_obj()
    i = 1
    for objurl in obj_url:
        print('开始下载图片' + '\t' + '第' + str(i) + '张')
        try:
            pic = requests.get(objurl, timeout=10)
        except requests.exceptions.ConnectionError:
            print('图片无法下载')
            continue
        except requests.exceptions.ReadTimeout:
            print('requests.exceptions.ReadTimeout')
            continue
        # One folder per search keyword
        main_path = r'E:\learn\py\git\spider\spider_learn\baidu\pic' + '\\' + search_id + '\\'
        if not os.path.exists(main_path):
            os.makedirs(main_path)
        path = main_path + search_id + str(i) + '.jpg'
        with open(path, 'wb') as f:
            f.write(pic.content)
        i += 1

if __name__ == '__main__':
    search_id = input('请输入要下载的内容:')
    save_pic()
```
2.4 baidu_more (multiple result pages)
```python
import requests
import re
import os
from multiprocessing.dummy import Pool

def get_urls(search_id):
    # Baidu pages results 30 at a time via the pn parameter
    total = input('请输入要几页----30张一页----:')
    url = 'http://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=' + search_id + '&pn='
    t = 0
    URLS = []
    while t < int(total) * 30:
        URLS.append(url + str(t))
        t = t + 30
    return URLS

def get_obj(url):
    # Fetch one result page and pull out its objURL links
    html = requests.get(url).content.decode()
    obj_URL = re.findall('"objURL":"(.*?)",', html, re.S)
    return obj_URL

def save_pic():
    # Fetch the result pages with a small thread pool, then download one by one
    pool = Pool(5)
    objurls = pool.map(get_obj, URLS)
    i = 1
    for objurl in objurls:
        for obj in objurl:
            print('开始下载图片' + '\t' + '第' + str(i) + '张')
            try:
                pic = requests.get(obj, timeout=10)
            except requests.exceptions.ConnectionError:
                print('图片无法下载')
                continue
            except requests.exceptions.ReadTimeout:
                print('requests.exceptions.ReadTimeout')
                continue
            main_path = patha + '\\' + search_id + '\\'
            if not os.path.exists(main_path):
                os.makedirs(main_path)
            path = main_path + search_id + str(i) + '.jpg'
            with open(path, 'wb') as f:
                f.write(pic.content)
            i += 1

if __name__ == '__main__':
    search_id = input('请输入要下载的内容:')
    URLS = get_urls(search_id)
    patha = input('输入文件保存路径----示例:E:\\baidu----:')
    save_pic()
```
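`multiprocessing.dummy.Pool` is a thread pool that exposes the same `map()` interface as the process-based pool, which suits I/O-bound work like fetching result pages. Note that only the result pages are fetched in parallel here; the image downloads themselves still run sequentially.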
3. Scraping images from Instagram (basic version)
```python
import requests
import json
import lxml.html
import re
import os

def get_src():
    # The post data is a JSON blob inside the first <script> tag of the profile page;
    # grab each post's thumbnail_resources list as a raw string
    url = 'https://www.instagram.com/baaaakuuuu'
    html = requests.get(url).content.decode()
    selector = lxml.html.fromstring(html)
    script = selector.xpath('/html/body/script[1]/text()')[0].strip()
    src = re.findall(r'"thumbnail_resources":\[(.*?)\]', script, re.S)
    return src

def get_picurl():
    # Pick the 640x640 thumbnail entry out of each thumbnail_resources list
    src = get_src()
    pic_url_lst = []
    for src_ls in src:
        thumb = re.findall(r'"config_height":480},{(.*?),"config_width":640,"config_height":640}', src_ls)[0]
        thumb_json = '{' + thumb + '}'
        thumb_py = json.loads(thumb_json)
        pic_url_lst.append(thumb_py['src'])
    return pic_url_lst

def save_pic():
    pic_url_lst = get_picurl()
    i = 1
    for pic_con in pic_url_lst:
        try:
            pic = requests.get(pic_con, timeout=10)
            main_path = 'E:/ins/'
            if not os.path.exists(main_path):
                os.makedirs(main_path)
            path = main_path + 'baku' + str(i) + '.jpg'
            with open(path, 'wb') as f:
                f.write(pic.content)
            print(f'第{i}张已下载')
            i += 1
        except requests.exceptions.ConnectionError:
            print('图片无法下载')
            continue

save_pic()
```
4. Scraping images from Wallhaven
4.1 Snail-paced crawl, just used to grab the images my blog needed, haha
_Crawling is slow and it takes ages before any files start saving; this is probably down to how the code is structured, to be optimized later (a sketch of a restructuring that saves as it goes follows the code below)_
```python
import lxml.html
import requests
import re
import os

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36"}

def get_url():
    # Collect the detail-page links from every search results page
    pages = input('输入页数:')
    url_pics = []
    page = 1
    while page <= int(pages):
        url = 'https://wallhaven.cc/search?categories=010&purity=100&resolutions=1280x800&sorting=relevance&order=desc&page=' + str(page)
        html = requests.get(url, headers=headers).content.decode()
        selector = lxml.html.fromstring(html)
        url_pic = selector.xpath('//*[@id="thumbs"]/section/ul/li/figure/a/@href')
        url_pics.append(url_pic)
        page += 1
        print('得到了内层url')
    return url_pics

def get_pic():
    # Open every detail page and pull out the full-size image URL
    url_pics = get_url()
    img_urls = []
    for urlst in url_pics:
        for url in urlst:
            htmlp = requests.get(url, headers=headers).content.decode()
            img_url = re.findall(r'"wallpaper" src="(.*?)"', htmlp, re.S)[0]
            img_urls.append(img_url)
            print('得到图片的url')
    return img_urls

def get_img(imgurl_list):
    # Download every collected image URL in order
    i = 1
    for url in imgurl_list:
        print('开始下载图片' + '\t' + '第' + str(i) + '张')
        try:
            pic = requests.get(url, timeout=10)
        except requests.exceptions.ConnectionError:
            print('图片无法下载')
            continue
        except requests.exceptions.ReadTimeout:
            print('requests.exceptions.ReadTimeout')
            continue
        main_path = 'E:\\wallhaven\\'
        if not os.path.exists(main_path):
            os.makedirs(main_path)
        path = main_path + str(i) + '.jpg'
        with open(path, 'wb') as f:
            f.write(pic.content)
        i += 1

if __name__ == '__main__':
    imgurl_list = get_pic()
    get_img(imgurl_list)
```
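The long wait before anything is saved comes from collecting every listing page and every detail page up front before the first download starts. Below is a minimal sketch of a restructuring that saves each wallpaper as soon as its detail page has been parsed, reusing one `requests.Session` for all requests; it assumes the same page structure, XPath and regex as above and is not a verified drop-in replacement:

```python
import os
import re

import lxml.html
import requests

def crawl_and_save(pages, out_dir='E:\\wallhaven\\'):
    session = requests.Session()  # reuse one connection pool for every request
    session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'})
    os.makedirs(out_dir, exist_ok=True)
    i = 1
    for page in range(1, pages + 1):
        listing = session.get(
            'https://wallhaven.cc/search?categories=010&purity=100&resolutions=1280x800'
            '&sorting=relevance&order=desc&page=' + str(page)).content.decode()
        for detail_url in lxml.html.fromstring(listing).xpath(
                '//*[@id="thumbs"]/section/ul/li/figure/a/@href'):
            # Parse the detail page and save the full image immediately,
            # instead of queueing its URL for a later pass.
            detail = session.get(detail_url).content.decode()
            img_url = re.findall(r'"wallpaper" src="(.*?)"', detail, re.S)[0]
            with open(out_dir + str(i) + '.jpg', 'wb') as f:
                f.write(session.get(img_url, timeout=10).content)
            print(f'已保存第{i}张')
            i += 1

if __name__ == '__main__':
    crawl_and_save(int(input('输入页数:')))
```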
4.2 Multi-threaded crawling