Using XPath
import requests
import json
from lxml import etree

url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
html = requests.get(url, headers=headers).text
# The response body is a string; parse it into an HTML DOM
text = etree.HTML(html)
# Locate the node of every feed item
node_list = text.xpath('//div[@class="explore-feed feed-item"]')

for node in node_list:
    # xpath returns a list; each query here has a single match, taken by index
    # Question
    question = node.xpath('.//h2/a')[0].text.replace("\n", "")
    # Author
    author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
    # author = "".join(node.xpath('.//*[@class="author-link-line"]//text()')).replace("\n", "")
    # Answer
    answer = node.xpath('.//*[@class="content"]')[0].text
    # answer = "".join(node.xpath('.//*[@class="content"]/text()')).strip()

    items = {
        "question": question,
        "author": author,
        "answer": answer,
    }

    # Append one JSON object per line; ensure_ascii=False keeps Chinese text readable
    with open("explore.json", "a", encoding="utf-8") as f:
        f.write(json.dumps(items, ensure_ascii=False) + "\n")
Saving as TXT
import requests
from lxml import etree

url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
html = requests.get(url, headers=headers).text
# The response body is a string; parse it into an HTML DOM
text = etree.HTML(html)
# Locate the node of every feed item
node_list = text.xpath('//div[@class="explore-feed feed-item"]')

for node in node_list:
    # Question
    question = node.xpath('.//h2/a')[0].text.replace("\n", "")
    # Author
    author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
    # Answer
    answer = node.xpath('.//*[@class="content"]')[0].text

    # Append each record, separated by a divider line
    with open('explore.txt', 'a', encoding='utf-8') as file:
        file.write('\n'.join([question, author, answer]))
        file.write('\n' + '=' * 50 + '\n')
Saving as CSV
import requests
import csv
from lxml import etree

url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
html = requests.get(url, headers=headers).text
# The response body is a string; parse it into an HTML DOM
text = etree.HTML(html)
# Locate the node of every feed item
node_list = text.xpath('//div[@class="explore-feed feed-item"]')

# Open the file once and write the header a single time,
# instead of reopening it (and repeating the header) for every row
with open('explore.csv', 'a', encoding='utf-8', newline='') as csvfile:
    fieldnames = ['question', 'author', 'answer']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for node in node_list:
        # Question
        question = node.xpath('.//h2/a')[0].text.replace("\n", "")
        # Author
        author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
        # Answer: to simplify the display, keep only the first 10 characters, text[:10]
        answer = node.xpath('.//*[@class="content"]')[0].text[:10]
        writer.writerow({'question': question, 'author': author, 'answer': answer})
Reading CSV
import csv

with open('explore.csv', 'r', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        print(row)
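If you prefer each row as a dict keyed by the header instead of a plain list, csv.DictReader works the same way. A minimal sketch, assuming the explore.csv written above:

import csv

with open('explore.csv', 'r', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        # Each row is a dict keyed by the header fields
        print(row['question'], row['author'], row['answer'])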
For garbled characters when opening the file in Excel, see:
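The usual cause is that Excel does not detect plain UTF-8. A minimal sketch of the common fix: write the file with the 'utf-8-sig' codec, which prepends a BOM so Excel recognizes the encoding:

import csv

# 'utf-8-sig' writes a UTF-8 BOM, which lets Excel detect the encoding
with open('explore.csv', 'a', encoding='utf-8-sig', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=['question', 'author', 'answer'])
    writer.writeheader()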
Saving to MongoDB
import requests
from lxml import etree
from pymongo import MongoClient

client = MongoClient()
db = client['explore']
collection = db['explore']

url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
html = requests.get(url, headers=headers).text
# The response body is a string; parse it into an HTML DOM
text = etree.HTML(html)
# Locate the node of every feed item
node_list = text.xpath('//div[@class="explore-feed feed-item"]')

for node in node_list:
    # Question
    question = node.xpath('.//h2/a')[0].text.replace("\n", "")
    # Author
    author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
    # Answer
    answer = node.xpath('.//*[@class="content"]')[0].text

    items = {
        "question": question,
        "author": author,
        "answer": answer,
    }

    # insert() is deprecated in pymongo 3 and removed in 4; use insert_one()
    if collection.insert_one(items):
        print('Saved to Mongo')
A slight change
import requests
from lxml import etree
from pymongo import MongoClient

client = MongoClient()
db = client['explore']
collection = db['explore']

url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
html = requests.get(url, headers=headers).text
# The response body is a string; parse it into an HTML DOM
text = etree.HTML(html)
# Locate the node of every feed item
node_list = text.xpath('//div[@class="explore-feed feed-item"]')

# The dict is now created once, outside the loop, and reused for every item
explore = {}
for node in node_list:
    # Question
    explore['question'] = node.xpath('.//h2/a')[0].text.replace("\n", "")
    # Author
    explore['author'] = node.xpath('.//*[@class="author-link-line"]/*')[0].text
    # Answer
    explore['answer'] = node.xpath('.//*[@class="content"]')[0].text

    if collection.insert_one(explore):
        print('Saved to Mongo')
An exception is raised
raise DuplicateKeyError(error.get("errmsg"), 11000, error)
pymongo.errors.DuplicateKeyError: E11000 duplicate key error collection: explore.explore index: _id_ dup key: { : ObjectId('5b3792ae393e0d0c38123bbc') }
Every MongoDB document carries a unique identifier: the `_id` field, an ObjectId primary key. This special and very important field is added to every document by default, and MongoDB automatically builds a unique index on `_id`. Because `explore` is now a single dict reused across iterations, pymongo sets the generated `_id` on it after the first insert; every later iteration then tries to insert a document carrying that same `_id`, which violates the unique index and raises the duplicate-key error.
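A minimal sketch isolating the mechanism (the client and collection names are reused from above; the point is only that pymongo mutates the dict passed to it):

from pymongo import MongoClient

collection = MongoClient()['explore']['explore']

doc = {"question": "q1"}
collection.insert_one(doc)
print(doc)  # the dict now also contains the generated '_id'

doc["question"] = "q2"
collection.insert_one(doc)  # DuplicateKeyError: same '_id' inserted again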
import requests
from lxml import etree
from pymongo import MongoClient
import pymongo.errors

client = MongoClient()
db = client['explore']
collection = db['explore']

url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
html = requests.get(url, headers=headers).text
# The response body is a string; parse it into an HTML DOM
text = etree.HTML(html)
# Locate the node of every feed item
node_list = text.xpath('//div[@class="explore-feed feed-item"]')

explore = {}
for node in node_list:
    # Question
    explore['question'] = node.xpath('.//h2/a')[0].text.replace("\n", "")
    # Author
    explore['author'] = node.xpath('.//*[@class="author-link-line"]/*')[0].text
    # Answer
    explore['answer'] = node.xpath('.//*[@class="content"]')[0].text
    try:
        if collection.insert_one(explore):
            print('Saved to Mongo')
    except pymongo.errors.DuplicateKeyError:
        # pymongo raises this when inserting a duplicate value into a
        # uniquely indexed field; the insert fails
        print('Duplicate insert')
Catching the exception only hides the error; the data still cannot be inserted.
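A minimal sketch of an actual fix, under the same setup as above: drop the stale `_id` before each insert so MongoDB generates a fresh one (building a new dict inside the loop, as in the first MongoDB version, works just as well):

for node in node_list:
    explore['question'] = node.xpath('.//h2/a')[0].text.replace("\n", "")
    explore['author'] = node.xpath('.//*[@class="author-link-line"]/*')[0].text
    explore['answer'] = node.xpath('.//*[@class="content"]')[0].text
    # Remove the '_id' left over from the previous insert so a new one is generated
    explore.pop('_id', None)
    if collection.insert_one(explore):
        print('Saved to Mongo')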