Self-made Python crawlers (GitHub link)
1. Scraping the bilibili daily ranking
- Scrape the page with XPath and save the data to a CSV file
- Name the CSV after the date range the ranking covers
```python
import requests
import csv
import lxml.html

url = 'https://www.bilibili.com/ranking/'
html = requests.get(url).content.decode()

selector = lxml.html.fromstring(html)

# Video titles
title = selector.xpath('//*[@id="app"]/div[1]/div/div[1]/div[2]/div[3]/ul/li/div[2]/div[2]/a/text()')
# Video links
link = selector.xpath('//*[@id="app"]/div[1]/div/div[1]/div[2]/div[3]/ul/li/div[2]/div[1]/a/@href')
# Uploader names
up_name = selector.xpath('//*[@id="app"]/div[1]/div/div[1]/div[2]/div[3]/ul/li/div[2]/div[2]/div[1]/a/span/text()')
# Play counts (scraped but not written to the CSV below)
up_videoplay = selector.xpath('//*[@id="app"]/div[1]/div/div[1]/div[2]/div[3]/ul/li/div[2]/div[2]/div[1]/span[1]/text()')

# The banner text contains the date range; strip the surrounding words
# so only the range itself is left to use as the file name
time = selector.xpath('//*[@id="app"]/div[1]/div/div[1]/div[2]/div[2]/div/span/text()')
time_num = time[0]
str1 = time_num.replace(' 的数据综合得分,每日更新一次', '')
time_num2 = str1.replace('统计所有投稿在 ', '')

headers = ['up_name', 'title', 'link']
rows = []
for i in range(100):
    rows.append([up_name[i], title[i], link[i]])

# newline='' keeps csv.writer from inserting blank lines on Windows
with open(f'{time_num2}.csv', 'w', encoding='utf-8', newline='') as f:
    f_csv = csv.writer(f)
    f_csv.writerow(headers)
    f_csv.writerows(rows)
```
- Partial view of the resulting CSV; the file is named after the scraped date range: 2020年02月07日 - 2020年02月10日
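
The script also collects `up_videoplay` (play counts) but never writes it out. If that column is wanted as well, a minimal tweak would be (a sketch, assuming the play-count XPath returns one string per ranking entry):

```python
# Hypothetical extension: also store the play count scraped above
headers = ['up_name', 'title', 'link', 'play_count']
rows = [[up_name[i], title[i], link[i], up_videoplay[i]] for i in range(100)]
```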

2. Scraping images from a baidu image search (basic)
2.1 thumbURL
```python
import re
import requests
import os


def download(html):
    # Thumbnail URLs appear in the page source as "thumbURL":"..."
    pic_url = re.findall('"thumbURL":"(.*?)",', html, re.S)
    i = 1
    for key in pic_url:
        print("开始下载图片:" + key + "\r\n")
        try:
            pic = requests.get(key, timeout=10)
        except requests.exceptions.ConnectionError:
            print('图片无法下载')
            continue

        main_path = "E:/baidu/"
        if not os.path.exists(main_path):
            os.makedirs(main_path)
        dir = main_path + str(i) + '.jpg'
        with open(dir, 'wb') as fp:
            fp.write(pic.content)
        i += 1


def main():
    url = 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&fm=result&pos=history&word=siyueshinide'
    result = requests.get(url)
    download(result.text)


if __name__ == '__main__':
    main()
```
2.2 objURL
_Higher resolution, but some images still fail to download; a workaround sketch follows the code block below_
```python
import re
import requests
import os


def download(html):
    # objURL points at the original, full-resolution image
    pic_url = re.findall('"objURL":"(.*?)",', html, re.S)
    i = 1
    for key in pic_url:
        print("开始下载图片:" + key + "\r\n")
        try:
            pic = requests.get(key, timeout=10)
        except requests.exceptions.ConnectionError:
            print('图片无法下载')
            continue
        except requests.exceptions.ReadTimeout:
            print('requests.exceptions.ReadTimeout')
            continue

        main_path = "E:/baidu/"
        if not os.path.exists(main_path):
            os.makedirs(main_path)
        dir = main_path + str(i) + '.jpg'
        with open(dir, 'wb') as fp:
            fp.write(pic.content)
        i += 1


def main():
    url = 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&fm=result&pos=history&word=siyueshinide'
    result = requests.get(url)
    download(result.text)


if __name__ == '__main__':
    main()
```
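
Part of the "some images can't be downloaded" problem is that failures often come back as HTTP errors or non-image responses rather than connection errors. A hedged workaround sketch (the helper name `safe_save` is mine, not from the original code): catch `requests.exceptions.RequestException` to cover all request failures at once, and skip anything that is not a 200 response with an image content type.

```python
import requests


def safe_save(url, path):
    """Download url to path; return True on success, False if skipped."""
    try:
        pic = requests.get(url, timeout=10)
    except requests.exceptions.RequestException as e:  # covers ConnectionError, ReadTimeout, ...
        print('图片无法下载:', e)
        return False
    # Skip error pages and other non-image responses
    if pic.status_code != 200 or not pic.headers.get('Content-Type', '').startswith('image/'):
        print('跳过非图片响应:', url)
        return False
    with open(path, 'wb') as f:
        f.write(pic.content)
    return True
```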
2.3 baidu, object-oriented version
```python
import requests
import re
import os


def get_id(search_id):
    # Build the search URL for the given keyword
    url = 'http://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=' + search_id
    return url


def get_obj():
    url = get_id(search_id)
    html = requests.get(url).content.decode()
    obj_URL = re.findall('"objURL":"(.*?)",', html, re.S)
    return obj_URL


def save_pic():
    obj_url = get_obj()
    i = 1
    for objurl in obj_url:
        print('开始下载图片' + '\t' + '第' + str(i) + '张')
        try:
            pic = requests.get(objurl, timeout=10)
        except requests.exceptions.ConnectionError:
            print('图片无法下载')
            continue
        except requests.exceptions.ReadTimeout:
            print('requests.exceptions.ReadTimeout')
            continue

        # One sub-folder per search keyword
        main_path = os.path.join(r'E:\learn\py\git\spider\spider_learn\baidu\pic', search_id)
        if not os.path.exists(main_path):
            os.makedirs(main_path)
        dir = os.path.join(main_path, search_id + str(i) + '.jpg')
        with open(dir, 'wb') as f:
            f.write(pic.content)
        i += 1


if __name__ == '__main__':
    search_id = input('请输入要下载的内容:')
    save_pic()
```
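
The version above is still a set of plain functions. If the point of 2.3 is the object-oriented shape, a minimal class-based sketch of the same flow could look like this (class and method names are my own, not from the repository):

```python
import os
import re
import requests


class BaiduPicSpider:
    """Sketch only: same fetch-parse-save flow, wrapped in a class."""

    def __init__(self, search_id, save_root=r'E:\baidu'):
        self.search_id = search_id
        self.save_dir = os.path.join(save_root, search_id)

    def obj_urls(self):
        # Same objURL extraction as the function version
        url = ('http://image.baidu.com/search/index?tn=baiduimage&ps=1'
               '&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=' + self.search_id)
        html = requests.get(url).content.decode()
        return re.findall('"objURL":"(.*?)",', html, re.S)

    def save_all(self):
        os.makedirs(self.save_dir, exist_ok=True)
        for i, obj in enumerate(self.obj_urls(), start=1):
            try:
                pic = requests.get(obj, timeout=10)
            except requests.exceptions.RequestException:
                continue  # skip anything that fails or times out
            with open(os.path.join(self.save_dir, f'{self.search_id}{i}.jpg'), 'wb') as f:
                f.write(pic.content)


if __name__ == '__main__':
    BaiduPicSpider(input('请输入要下载的内容:')).save_all()
```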
2.4 baidu_more
```python
import requests
import re
import os
from multiprocessing.dummy import Pool  # thread pool with the multiprocessing API


def get_urls(search_id):
    total = input('请输入要几页----30张一页----:')
    url = 'http://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=' + search_id + '&pn='
    # Each result page holds 30 images; pn is the offset into the results
    t = 0
    URLS = []
    while t < int(total) * 30:
        URLS.append(url + str(t))
        t += 30
    return URLS


def get_obj(url):
    html = requests.get(url).content.decode()
    obj_URL = re.findall('"objURL":"(.*?)",', html, re.S)
    return obj_URL


def save_pic():
    # Fetch all result pages concurrently with 5 worker threads
    pool = Pool(5)
    objurls = pool.map(get_obj, URLS)
    i = 1
    for objurl in objurls:
        for obj in objurl:
            print('开始下载图片' + '\t' + '第' + str(i) + '张')
            try:
                pic = requests.get(obj, timeout=10)
            except requests.exceptions.ConnectionError:
                print('图片无法下载')
                continue
            except requests.exceptions.ReadTimeout:
                print('requests.exceptions.ReadTimeout')
                continue

            main_path = os.path.join(patha, search_id)
            if not os.path.exists(main_path):
                os.makedirs(main_path)
            dir = os.path.join(main_path, search_id + str(i) + '.jpg')
            with open(dir, 'wb') as f:
                f.write(pic.content)
            i += 1


if __name__ == '__main__':
    search_id = input('请输入要下载的内容:')
    URLS = get_urls(search_id)
    patha = input('输入文件保存路径----示例:E:\\baidu----:')
    save_pic()
```
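
Note that `pool.map(get_obj, URLS)` only parallelizes fetching the result pages; the downloads themselves still run one by one. A sketch of pushing the downloads through the same kind of thread pool (helper names are assumptions, not part of the original script):

```python
from multiprocessing.dummy import Pool
import os
import requests


def download_one(task):
    idx, obj, save_dir, name = task
    try:
        pic = requests.get(obj, timeout=10)
    except requests.exceptions.RequestException:
        return  # skip anything that fails or times out
    with open(os.path.join(save_dir, name + str(idx) + '.jpg'), 'wb') as f:
        f.write(pic.content)


def save_pic_parallel(objurls, save_dir, name):
    os.makedirs(save_dir, exist_ok=True)
    flat = [obj for page in objurls for obj in page]            # flatten per-page lists
    tasks = [(i + 1, obj, save_dir, name) for i, obj in enumerate(flat)]
    Pool(5).map(download_one, tasks)                            # 5 concurrent downloads
```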
3. Scraping images from Instagram (basic version)
```python
import requests
import json
import lxml.html
import re
import os


def get_src():
    url = 'https://www.instagram.com/baaaakuuuu'
    html = requests.get(url).content.decode()
    selector = lxml.html.fromstring(html)
    # The post data is embedded in an inline <script> block
    script = selector.xpath('/html/body/script[1]/text()')[0].strip()
    # Every post carries a "thumbnail_resources" array with several sizes
    src = re.findall(r'"thumbnail_resources":\[(.*?)\]', script, re.S)
    return src


def get_picurl():
    src = get_src()
    pic_url_lst = []
    for src_ls in src:
        # Grab the 640x640 entry and rebuild it into a JSON object
        thumb = re.findall(r'"config_height":480},{(.*?),"config_width":640,"config_height":640}', src_ls)[0]
        thumb_json = '{' + thumb + '}'
        thumb_py = json.loads(thumb_json)
        pic_url_lst.append(thumb_py['src'])
    return pic_url_lst


def save_pic():
    pic_url_lst = get_picurl()
    i = 1
    for pic_con in pic_url_lst:
        try:
            pic = requests.get(pic_con, timeout=10)
            main_path = 'E:/ins/'
            if not os.path.exists(main_path):
                os.makedirs(main_path)
            path = main_path + 'baku' + str(i) + '.jpg'
            with open(path, 'wb') as f:
                f.write(pic.content)
            print(f'第{i}张已下载')
            i += 1
        except requests.exceptions.ConnectionError:
            print('图片无法下载')
            continue


save_pic()
```
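
The second regex in `get_picurl` is tightly coupled to the 480/640 ordering of the sizes. Since the captured text is already a JSON array of `{"src": ..., "config_width": ..., "config_height": ...}` objects (which is exactly what the regexes above rely on), a slightly sturdier sketch is to parse the whole array and pick the width you want (the helper name `pick_size` is an assumption):

```python
import json


def pick_size(src_ls, width=640):
    """Return the src of the thumbnail_resources entry with the requested width."""
    entries = json.loads('[' + src_ls + ']')
    for entry in entries:
        if entry.get('config_width') == width:
            return entry['src']
    return entries[-1]['src']  # otherwise fall back to the last listed size
```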
4. Scraping images from Wallhaven
4.1 Snail-paced crawl, just used to grab the images my blog needed haha
_The crawl is slow and it takes ages before any file starts saving, probably because of how I structured the code; I'll optimize it later_
```python
import lxml.html
import requests
import re
import os

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36"}


def get_url():
    pages = input('输入页数:')
    url_pics = []
    page = 1
    while page <= int(pages):
        # Each search result page lists the wallpapers' detail-page links
        url = 'https://wallhaven.cc/search?categories=010&purity=100&resolutions=1280x800&sorting=relevance&order=desc&page=' + str(page)
        html = requests.get(url, headers=headers).content.decode()
        selector = lxml.html.fromstring(html)
        url_pic = selector.xpath('//*[@id="thumbs"]/section/ul/li/figure/a/@href')
        url_pics.append(url_pic)
        page += 1
        print('得到了内层url')
    return url_pics


def get_pic():
    url_pics = get_url()
    img_urls = []
    for urlst in url_pics:
        for url in urlst:
            # Visit every detail page and pull out the full-size image URL
            htmlp = requests.get(url, headers=headers).content.decode()
            img_url = re.findall(r'"wallpaper" src="(.*?)"', htmlp, re.S)[0]
            img_urls.append(img_url)
            print('得到图片的url')
    return img_urls


def get_img(imgurl_list):
    i = 1
    for url in imgurl_list:
        print('开始下载图片' + '\t' + '第' + str(i) + '张')
        try:
            pic = requests.get(url, timeout=10)
        except requests.exceptions.ConnectionError:
            print('图片无法下载')
            continue
        except requests.exceptions.ReadTimeout:
            print('requests.exceptions.ReadTimeout')
            continue

        main_path = 'E:\\wallhaven\\'
        if not os.path.exists(main_path):
            os.makedirs(main_path)
        dir = main_path + str(i) + '.jpg'
        with open(dir, 'wb') as f:
            f.write(pic.content)
        i += 1


if __name__ == '__main__':
    imgurl_list = get_pic()
    get_img(imgurl_list)
```
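
Before reaching for threads (section 4.2), one cheap improvement is to reuse a single `requests.Session`, so the many sequential requests share pooled connections instead of opening a new one each time. A sketch, reusing the `headers` dict defined above (the `fetch` wrapper is my own name):

```python
import requests

session = requests.Session()
session.headers.update(headers)  # the User-Agent dict from the script above


def fetch(url):
    # Drop-in replacement for the requests.get(...) calls in get_url/get_pic/get_img
    return session.get(url, timeout=10)
```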
4.2 Multi-threaded crawl