"""Download the pages linked from one ibiqu.org index page into a text file.

The index page at ``url`` is fetched, every ``<dd><a href="...">`` link on it
is collected (presumably the chapter list of the novel), and then each linked
page is fetched in turn and the text of its ``<div id="content">`` element is
appended to ``114514.txt``.
"""
import re

import numpy as np
import requests
from bs4 import BeautifulSoup

# Target URL: the index page to scrape.
url = 'http://www.ibiqu.org/148_148106/'
# Site root. The hrefs on the index page are joined onto this prefix; without
# it the links would have to be split apart and rebuilt later.
url2 = 'http://www.ibiqu.org'
# Request headers sent with every request (a mobile browser User-Agent).
head_bqg = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Mobile Safari/537.36 Edg/114.0.1823.37'
}
# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the script forever.
REQUEST_TIMEOUT = 10

# Patterns are compiled once here instead of being rebuilt on every iteration.
LINK_RE = re.compile(r'<dd><a href="(.*?)"')
# DOTALL lets the capture run across line breaks; without it a content div
# that spans several lines produces no match at all and nothing is written.
CONTENT_RE = re.compile(r'<div id="content">(.*?)</div>', re.DOTALL)
# Paragraph markup to strip: "<p>" followed by two ideographic spaces
# (U+3000), and the closing "</p>".
PARAGRAPH_MARKUP_RE = re.compile('<p>\u3000\u3000|</p>')

html_zhuye = requests.get(url, headers=head_bqg, timeout=REQUEST_TIMEOUT)
# The regex runs on BeautifulSoup's re-serialised markup, not on the raw
# response text, so keep the round trip through the parser.
html_1 = str(BeautifulSoup(html_zhuye.text, 'html.parser'))
imglists = LINK_RE.findall(html_1)

# Build the array in one step; np.append inside a loop copies the whole array
# on every link.
url_lists = np.array([f'{url2}{imglist}' for imglist in imglists])
print(url_lists)

# "with" guarantees the file is closed even if a request fails part-way
# through; the explicit encoding keeps the Chinese text writable regardless
# of the platform's default encoding.
with open('114514.txt', 'w', encoding='utf-8') as file_1:
    for url_list in url_lists:
        txt_novel = requests.get(
            url_list, headers=head_bqg, timeout=REQUEST_TIMEOUT
        )
        # findall returns an empty list when the page has no content div, so
        # iterate rather than index into the result.
        for txt_3 in CONTENT_RE.findall(txt_novel.text):
            file_1.write(PARAGRAPH_MARKUP_RE.sub('', txt_3) + '\n')
问题总结
Python 列表为空,导致按索引取值时抛出 IndexError(list index out of range)。
个人总结
不要图省事,至少在报错的时候最好用最基础的方法试一遍
Python 爬虫 “IndexError: list index out of range” 错误及其解决办法