网站截图:
源代码:
1 import re
2 import requests
3 import pymysql
4 from bs4 import BeautifulSoup
5 import lxml
6 import traceback
7 import time
8 import json
9 from lxml import etree
def query(sql, *args):
    """Run a SQL query and return every row of its result.

    :param sql: SQL text; may contain ``%s`` placeholders.
    :param args: values bound to the placeholders in ``sql``, in order.
    :return: all fetched rows, in the form ``((...), (...), ...)``.
    """
    conn, cursor = get_conn()
    try:
        # Forward the bound values (they used to be accepted but ignored).
        # ``None`` is passed when there are none, so a literal ``%`` inside
        # ``sql`` is not interpreted as a placeholder.
        cursor.execute(sql, args or None)
        return cursor.fetchall()
    finally:
        # Release the connection even when execute/fetch raises.
        close_conn(conn, cursor)
def _resolve_paper_href(href):
    """Convert one anchor ``href`` from a paper's link row into a stored link."""
    if href is None:
        # Anchor without a target: keep a placeholder so link positions
        # within the row stay stable.
        return "fakelink"
    href_text = str(href)
    if "http" not in href_text and "papers" in href_text:
        # Site-relative paper path: make it absolute.
        return "https://www.ecva.net/" + href_text
    return href


def _link_at(links, index):
    """Return ``links[index]``, or the ``"fakelink"`` placeholder when absent."""
    return links[index] if index < len(links) else "fakelink"


def get_paper():
    """Scrape the ECVA paper index page and build one record per paper.

    The page is read as ``<dt class="ptitle">`` elements (title + link)
    and ``<dd>`` elements that alternate between an author entry and a
    link row.

    :return: list of records, each a list
        ``[title, sourcelink, author, download, pdfinfo]``.
    :raises requests.RequestException: if the index page cannot be fetched.
    :raises IndexError: if the page has more author entries than titles.
    """
    # Example detail page:
    # https://www.ecva.net/papers/eccv_2020/papers_ECCV/html/343_ECCV_2020_paper.php
    url = 'https://www.ecva.net/papers.php'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36'
    }
    # ``headers`` must be passed by keyword: the second positional parameter
    # of ``requests.get`` is ``params``, which would put the User-Agent into
    # the query string instead of sending it as a header.
    response = requests.get(url, headers=headers, timeout=30)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')

    # --- titles and source links -------------------------------------
    all_dt = soup.find_all('dt', class_='ptitle')
    print("dt:" + str(len(all_dt)))
    res = []
    for dt in all_dt:
        anchor = dt.find('a')
        # The first two characters of the anchor text are dropped --
        # presumably leading whitespace in the page markup; TODO confirm.
        title = anchor.text[2:]
        sourcelink = "https://www.ecva.net/" + anchor['href']
        res.append([title, sourcelink])

    # --- authors and pdf links ---------------------------------------
    all_dd = soup.find_all('dd')
    print("dd:" + str(len(all_dd)))
    author = []    # raw <dd> tags holding the author text
    link_res = []  # one list of resolved links per paper
    for position, item in enumerate(all_dd):
        if position % 2 == 0:
            # Even positions hold the authors.
            author.append(item)
        else:
            # Odd positions hold the row of links for the same paper.
            links = [_resolve_paper_href(a.get('href'))
                     for a in item.find_all('a')]
            print(links)
            link_res.append(links)

    print("------------------------------")
    # --- merge authors and links into the records --------------------
    for i, author_tag in enumerate(author):
        new_author = str(author_tag).replace("<dd>", "")
        new_author = new_author.replace(" </dd>", "")
        new_author = new_author.replace("\n", "")
        res[i].append(new_author)

        links = link_res[i] if i < len(link_res) else []
        # With exactly two links they are (download, pdfinfo); otherwise
        # pdfinfo is taken from the third position -- presumably an extra
        # link sits between them; TODO confirm against the page.
        pdfinfo_index = 1 if len(links) == 2 else 2
        # Missing positions fall back to the placeholder instead of
        # raising IndexError on short link rows.
        res[i].append(_link_at(links, 0))
        res[i].append(_link_at(links, pdfinfo_index))
    print("----------------------")
    return res
120 #############################################################
121 #继续爬取abstract 和 keyword
def get_further(start=1358):
    """Fetch abstract and keywords for each stored ``pdfinfo`` link.

    Reads the ``pdfinfo`` column of the ``pdf`` table, requests every link
    from row ``start`` onwards, and appends the abstract and keyword string
    to the record at the same index in the ``get_paper()`` result.
    NOTE(review): this assumes the ``pdf`` table rows are in the same order
    as the records returned by ``get_paper()`` -- confirm.

    :param start: index of the first row to process. The default keeps the
        original hard-coded offset (presumably a resume point of an earlier
        run); pass ``0`` to process every row.
    :return: ``None``; progress and results are printed.
    """
    res = get_paper()
    # Each row is a 1-tuple, so the link is read with ``row[0]``.
    db_res = query("SELECT pdfinfo FROM pdf;")
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/91.0.4472.101 Safari/537.36"
    }
    for i in range(start, len(db_res)):
        url = db_res[i][0]
        print(url)
        try:
            # ``headers`` must be a keyword argument; positionally it would
            # be taken as ``params`` and end up in the query string.
            response = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException:
            print("链接无效!")
            continue
        response.encoding = "utf-8"
        soup = BeautifulSoup(response.text, 'lxml')

        abstract_tag = soup.find('p', id='Par1')
        if abstract_tag is None:
            # A page without the abstract paragraph is reported as an
            # invalid link and skipped.
            print("链接无效!")
            continue
        # Strip line breaks from the abstract.
        abstract = abstract_tag.text.replace("\n", "")
        print("摘要:" + abstract)

        keyword_str = "".join(
            tag.get_text() for tag in soup.find_all('span', class_="Keyword")
        )
        print("关键字:" + keyword_str)
        # Non-breaking spaces act as separators: turn them into commas and
        # drop only trailing separators (never a real last character).
        keyword_str = keyword_str.replace("\xa0", ",").rstrip(",")
        print([abstract, keyword_str])
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~~")

        if i >= len(res):
            # More database rows than scraped records.
            print("IndexError: list index out of range")
            continue
        # Use the values parsed for *this* row directly; indexing a separate
        # result list with the absolute row index could never match.
        record = res[i]
        record.append(abstract if abstract else "no abstract")
        record.append(keyword_str if keyword_str else "no keyword")
        print(record)
        # Persisting is disabled here; to store the record, call:
        # insert_paper_1(record, i)
    return
184
185 #连接数据库 获取游标
def get_conn():
    """
    Open a connection to the ``paperinfo`` MySQL database and create a cursor.

    :return: ``(connection, cursor)``
    """
    connection = pymysql.connect(
        host="127.0.0.1",
        user="root",
        password="000429",
        db="paperinfo",
        charset="utf8",
    )
    # Rows fetched through this cursor are returned as tuples by default.
    db_cursor = connection.cursor()
    if connection is None or db_cursor is None:
        print("数据库连接失败!")
    else:
        print("数据库连接成功!游标创建成功!")
    return connection, db_cursor
203 #关闭数据库连接和游标
def close_conn(conn, cursor):
    """Close the cursor and then the connection, skipping falsy ones.

    :param conn: database connection, or a falsy value to skip it.
    :param cursor: database cursor, or a falsy value to skip it.
    :return: always ``1``.
    """
    # The cursor is closed before the connection it belongs to.
    for resource in (cursor, conn):
        if resource:
            resource.close()
    return 1
def insert_paper_0():
    """Scrape the paper list and insert the basic fields into ``paper``.

    For every record from ``get_paper()`` the title, source link, author
    and download link are stored; abstract and keyword are inserted as
    empty strings. Rows rejected with an integrity error (duplicates) are
    reported and skipped. Any other error is printed as a traceback and
    nothing is committed.

    :return: ``None``
    """
    # Scrape before connecting, so a scraping failure cannot leak an open
    # connection and none is held idle during the network request.
    res = get_paper()
    conn, cursor = get_conn()
    print(f"{time.asctime()}开始插入论文详情数据")
    try:
        sql = "insert into paper (title,sourcelink,author,download,abstract,keyword) values(%s,%s," \
              "%s,%s,%s,%s)"
        for item in res:
            print(item)
            # Guard each row separately so one duplicate key does not
            # abort the whole batch.
            try:
                cursor.execute(sql, [item[0], item[1], item[2], item[3], "", ""])
            except pymysql.err.IntegrityError:
                print("重复!")
            print("###########################")
        conn.commit()  # commit the inserts as one transaction
        print(f"{time.asctime()}插入论文详情数据完毕")
    except Exception:
        # Report the failure; KeyboardInterrupt/SystemExit still propagate.
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
    return
232 #########################################
def insert_paper_1(res, count):
    """Insert one fully populated paper record into ``paper``.

    :param res: record indexed as ``[title, sourcelink, author, download,
        <unused>, abstract, keyword]``; index 4 is not stored here.
    :param count: zero-based position of the record, used only in the
        progress message (printed as ``count + 1``).
    :return: ``None``
    """
    conn, cursor = get_conn()
    print(f"{time.asctime()}开始插入论文详情数据")
    try:
        sql = "insert into paper (title,sourcelink,author,download,abstract,keyword) values(%s,%s," \
              "%s,%s,%s,%s)"
        print(res)
        # A duplicate key is reported instead of aborting.
        try:
            cursor.execute(sql, [res[0], res[1], res[2], res[3], res[5], res[6]])
        except pymysql.err.IntegrityError:
            print("重复!")
        print("###########################")
        conn.commit()  # commit the insert
        print(f"{time.asctime()}插入第{count + 1}条论文详情数据完毕")
    except Exception:
        # Report the failure; KeyboardInterrupt/SystemExit still propagate.
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
    return
253
254 #单独插入 pdfinfo
def inseet_pdf():
    """Scrape the paper list and insert every ``pdfinfo`` link into ``pdf``.

    Each row is inserted with ``id`` 0 and the record's fifth field
    (index 4) as ``pdfinfo``. Rows rejected with an integrity error are
    reported and skipped. Any other error is printed as a traceback and
    nothing is committed.

    :return: ``None``
    """
    # Scrape before connecting, so a scraping failure cannot leak an open
    # connection and none is held idle during the network request.
    res = get_paper()
    conn, cursor = get_conn()
    print(f"{time.asctime()}开始插入论文pdfinfo数据")
    try:
        sql = "insert into pdf (id,pdfinfo) values(%s,%s)"
        for item in res:
            print(item)
            # Guard each row separately so one duplicate key does not
            # abort the whole batch.
            try:
                # NOTE(review): id 0 presumably lets an auto-increment
                # column assign the key -- confirm against the schema.
                cursor.execute(sql, [0, item[4]])
            except pymysql.err.IntegrityError:
                print("重复!")
            print("###########################")
        conn.commit()  # commit the inserts as one transaction
        print(f"{time.asctime()}插入论文pdfinfo完毕")
    except Exception:
        # Report the failure; KeyboardInterrupt/SystemExit still propagate.
        traceback.print_exc()
    finally:
        close_conn(conn, cursor)
    return
if __name__ == '__main__':
    # Script entry point: fetch abstracts and keywords for the stored links.
    get_further()
    # Alternative entry point (disabled): inseet_pdf()
来源:https://www.cnblogs.com/rainbow-1/p/14880491.html
图文来源于网络,如有侵权请联系删除。