Using XPath [Crawling Zhihu Explore], with File Storage [txt, json, csv, MongoDB]
Published: 2019-06-27


Using XPath

import requests
import json
from lxml import etree

url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
html = requests.get(url, headers=headers).text
# The response is a string; parse it into an HTML DOM tree
text = etree.HTML(html)
# Select the node of every feed item on the page
node_list = text.xpath('//div[@class="explore-feed feed-item"]')

for node in node_list:
    # xpath() returns a list; each query here has a single match, taken by index
    # Question
    question = node.xpath('.//h2/a')[0].text.replace("\n", "")
    # Author
    author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
    #author = "".join(node.xpath('.//*[@class="author-link-line"]//text()')).replace("\n", "")
    # Answer
    answer = node.xpath('.//*[@class="content"]')[0].text
    #answer = "".join(node.xpath('.//*[@class="content"]/text()')).strip()
    #answer = str(node.xpath('.//*[@class="content"]/text()'))[1:-1]

    items = {
        "question": question,
        "author": author,
        "answer": answer,
    }

    # One JSON object per line; ensure_ascii=False keeps Chinese text readable
    with open("explore.json", "a", encoding="utf-8") as f:
        f.write(json.dumps(items, ensure_ascii=False) + "\n")
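Because each record is written on its own line, explore.json is effectively a JSON Lines file. A minimal sketch of reading it back, assuming the file produced above:

import json

# Each line of explore.json holds one complete JSON object (JSON Lines)
with open("explore.json", encoding="utf-8") as f:
    for line in f:
        item = json.loads(line)
        print(item["question"], "-", item["author"])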


Saving as TXT

import requests
from lxml import etree

url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
html = requests.get(url, headers=headers).text
# Parse the response string into an HTML DOM tree
text = etree.HTML(html)
# Select the node of every feed item on the page
node_list = text.xpath('//div[@class="explore-feed feed-item"]')

for node in node_list:
    # Question
    question = node.xpath('.//h2/a')[0].text.replace("\n", "")
    # Author
    author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
    # Answer
    answer = node.xpath('.//*[@class="content"]')[0].text

    # Append one record per item, separated by a rule of '=' characters
    with open('explore.txt', 'a', encoding='utf-8') as file:
        file.write('\n'.join([question, author, answer]))
        file.write('\n' + '=' * 50 + '\n')


Saving as CSV

import requests
from lxml import etree
import csv

url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
html = requests.get(url, headers=headers).text
# Parse the response string into an HTML DOM tree
text = etree.HTML(html)
node_list = text.xpath('//div[@class="explore-feed feed-item"]')

# Open the file once and write the header a single time; the original opened
# the file inside the loop, which repeated the header before every row
with open('explore.csv', 'w', encoding='utf-8', newline='') as csvfile:
    fieldnames = ['question', 'author', 'answer']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    for node in node_list:
        # Question
        question = node.xpath('.//h2/a')[0].text.replace("\n", "")
        # Author
        author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
        # Answer: keep only the first 10 characters, for easier display
        answer = node.xpath('.//*[@class="content"]')[0].text[:10]

        writer.writerow({'question': question, 'author': author, 'answer': answer})

Reading the CSV


import csv

with open('explore.csv', 'r', encoding='utf-8') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        print(row)
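csv.reader yields each row as a plain list. If you prefer rows keyed by the header, csv.DictReader works against the same file; a small sketch:

import csv

with open('explore.csv', 'r', encoding='utf-8') as csvfile:
    # DictReader uses the first row as the field names
    reader = csv.DictReader(csvfile)
    for row in reader:
        print(row['question'], row['author'])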


If the CSV shows garbled characters when opened in Excel, the usual cause is a missing byte-order mark: Excel only auto-detects UTF-8 when the file starts with a BOM, which Python's utf-8-sig codec writes for you.
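A minimal sketch of the change (only the encoding argument differs from the script above; the header row stands in for the full output):

import csv

# 'utf-8-sig' prepends a UTF-8 BOM so Excel detects the encoding correctly
with open('explore.csv', 'w', encoding='utf-8-sig', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['question', 'author', 'answer'])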


Saving to MongoDB

import requests
from lxml import etree
from pymongo import MongoClient

# Connect to a local MongoDB; database and collection are both named 'explore'
client = MongoClient()
db = client['explore']
collection = db['explore']

url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
html = requests.get(url, headers=headers).text
# Parse the response string into an HTML DOM tree
text = etree.HTML(html)
node_list = text.xpath('//div[@class="explore-feed feed-item"]')

for node in node_list:
    # Question
    question = node.xpath('.//h2/a')[0].text.replace("\n", "")
    # Author
    author = node.xpath('.//*[@class="author-link-line"]/*')[0].text
    # Answer
    answer = node.xpath('.//*[@class="content"]')[0].text

    items = {
        "question": question,
        "author": author,
        "answer": answer,
    }

    # insert() is deprecated in newer pymongo; insert_one() is the modern equivalent
    if collection.insert(items):
        print('Saved to Mongo')
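To confirm the writes, you can query the collection back. A minimal sketch, assuming a local MongoDB on the default port and pymongo 3.7+ (where count_documents was introduced):

from pymongo import MongoClient

client = MongoClient()
collection = client['explore']['explore']

# Count all stored documents, then print a few questions as a spot check
print(collection.count_documents({}))
for doc in collection.find().limit(3):
    print(doc['question'])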


A slight modification: create the dict once, outside the loop, and overwrite its keys on each iteration.

import requests
from lxml import etree
from pymongo import MongoClient

client = MongoClient()
db = client['explore']
collection = db['explore']

url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
html = requests.get(url, headers=headers).text
# Parse the response string into an HTML DOM tree
text = etree.HTML(html)
node_list = text.xpath('//div[@class="explore-feed feed-item"]')

# Note: a single dict is created outside the loop and reused for every insert
explore = {}
for node in node_list:
    # Question
    explore['question'] = node.xpath('.//h2/a')[0].text.replace("\n", "")
    # Author
    explore['author'] = node.xpath('.//*[@class="author-link-line"]/*')[0].text
    # Answer
    explore['answer'] = node.xpath('.//*[@class="content"]')[0].text

    if collection.insert(explore):
        print('Saved to Mongo')


Running it raises an exception:

    raise DuplicateKeyError(error.get("errmsg"), 11000, error)

pymongo.errors.DuplicateKeyError: E11000 duplicate key error collection: explore.explore index: _id_ dup key: { : ObjectId('5b3792ae393e0d0c38123bbc') }


The _id field identifies a unique document. Every document automatically carries a uniqueness attribute: the ObjectId primary key, a special and very important type that MongoDB configures by default under the field name _id.

MongoDB creates a unique index on _id by default.
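That default unique index is exactly what trips the modified script: pymongo sets _id on the very dict you pass to insert, so reusing one dict means reusing one _id. A small sketch demonstrating the in-place mutation, using a hypothetical 'demo' collection and the modern insert_one():

from pymongo import MongoClient

client = MongoClient()
collection = client['explore']['demo']

doc = {'question': 'q1'}
collection.insert_one(doc)
# pymongo has added the generated ObjectId to the dict in place
print(doc)  # {'question': 'q1', '_id': ObjectId('...')}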


import requests
from lxml import etree
from pymongo import MongoClient
import pymongo.errors

client = MongoClient()
db = client['explore']
collection = db['explore']

url = 'https://www.zhihu.com/explore'
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}
html = requests.get(url, headers=headers).text
# Parse the response string into an HTML DOM tree
text = etree.HTML(html)
node_list = text.xpath('//div[@class="explore-feed feed-item"]')

explore = {}
for node in node_list:
    # Question
    explore['question'] = node.xpath('.//h2/a')[0].text.replace("\n", "")
    # Author
    explore['author'] = node.xpath('.//*[@class="author-link-line"]/*')[0].text
    # Answer
    explore['answer'] = node.xpath('.//*[@class="content"]')[0].text
    try:
        if collection.insert(explore):
            print('Saved to Mongo')
    except pymongo.errors.DuplicateKeyError:
        # Raised when a duplicate value is inserted into a unique field
        print("Duplicate insert")


This still inserts only the first document. insert() adds the generated _id to the explore dict on the first call, and because the same dict is reused, every later iteration carries that stale _id and hits the duplicate-key error, which is now merely caught instead of fixed. The remedy is to build a fresh dict inside the loop (or remove the stale _id each iteration), as in the sketch below.
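A minimal sketch of the corrected loop, reusing node_list and collection from the script above and switching to the modern insert_one():

for node in node_list:
    # A fresh dict per iteration lets pymongo generate a new _id every time
    explore = {
        'question': node.xpath('.//h2/a')[0].text.replace('\n', ''),
        'author': node.xpath('.//*[@class="author-link-line"]/*')[0].text,
        'answer': node.xpath('.//*[@class="content"]')[0].text,
    }
    if collection.insert_one(explore).inserted_id:
        print('Saved to Mongo')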


Reposted from: https://www.cnblogs.com/wanglinjie/p/9248573.html
