源码:
import re

import requests

from my_mysql import MysqlConnect

# Matches one HTML tag such as <em>, </em> or <br>.
_TAG_RE = re.compile(r'<[^>]*>')

# Seconds to wait for the API before giving up instead of hanging forever.
_REQUEST_TIMEOUT = 10


def _strip_tags(text):
    """Return *text* with every ``<...>`` tag removed and all other text kept."""
    return _TAG_RE.sub('', text)


# Fetch question/answer information.
def get_contents(page, headers):
    """Fetch one page of answers and return ``(question title, excerpt)`` pairs.

    Args:
        page: Value substituted into the ``offset`` query parameter of the
            answers API URL (the request itself asks for ``limit=20``).
        headers: HTTP headers passed through to ``requests.get``.

    Returns:
        A list of ``(question, excerpt)`` tuples, one per item in the
        response's ``data`` list, with HTML tags removed from the excerpt.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
        KeyError: If the JSON payload lacks ``data``, ``question``/``title``
            or ``excerpt``.
    """
    url = 'https://www.zhihu.com/api/v4/members/chen-lu-ya-26/answers?include=data%5B*%5D.is_normal%2Cadmin_closed_comment%2Creward_info%2Cis_collapsed%2Cannotation_action%2Cannotation_detail%2Ccollapse_reason%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2Ccan_comment%2Ccontent%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2Ccreated_time%2Cupdated_time%2Creview_info%2Cquestion%2Cexcerpt%2Crelationship.is_authorized%2Cvoting%2Cis_author%2Cis_thanked%2Cis_nothelp%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset={}&limit=20&sort_by=created'.format(page)
    req = requests.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
    # Fail with a clear HTTP error rather than a KeyError on an error payload.
    req.raise_for_status()
    data_list = req.json()['data']
    # The previous search/findall approach crashed when '<' appeared without a
    # matching tag and dropped text between consecutive tag pairs; a single
    # substitution keeps every piece of non-tag text.
    return [
        (item['question']['title'], _strip_tags(item['excerpt']))
        for item in data_list
    ]


def main():
    """Scrape eight pages of answers and insert each pair into the ``zhihu`` table."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    # NOTE(review): database credentials are hard-coded here; consider moving
    # them to configuration or environment variables.
    mc = MysqlConnect('127.0.0.1', 'root', '123456', 'homework')
    # Loop-invariant statement; the two placeholders receive (question, excerpt).
    sql = 'insert into zhihu values(null,%s,%s)'
    # Offsets 0, 20, ..., 140: eight pages of 20 answers each.
    for page in range(0, 20 * 8, 20):
        contents = get_contents(page, headers)
        for content in contents:
            # presumably exec_data runs the statement with `content` as its
            # parameters; verify against my_mysql.MysqlConnect.
            mc.exec_data(sql, content)
            print(content)


if __name__ == '__main__':
    main()