from lxml import etree
from bs4 import BeautifulSoup
import requests
def readalight(address):
    """Fetch a web page and print its summary and main-content paragraphs.

    The page is downloaded with ``requests``, decoded as UTF-8 and parsed
    with ``lxml``. The text nodes of every ``<p class="summary">`` are
    printed as-is, followed by the text nodes of the ``<p>`` children of
    ``<div id="mainCnt">``, each wrapped in ``<p>...</p>``.

    Args:
        address: URL of the page to fetch. Surrounding whitespace (for
            example the trailing newline left by ``readline``) is removed;
            a blank address is ignored.

    Returns:
        None. All output is written to stdout.

    Raises:
        requests.exceptions.RequestException: if the download fails or
            exceeds the timeout.
        UnicodeDecodeError: if the response body is not valid UTF-8.
    """
    address = address.strip()
    if not address:
        # Blank lines in a URL list are not worth a network round-trip.
        return

    # Download the page source; the timeout keeps a stalled server from
    # hanging the whole run.
    html = requests.get(address, timeout=30).content.decode('utf-8')
    dom_tree = etree.HTML(html)

    # XPath matching
    paragraphs = dom_tree.xpath('//div[@id="mainCnt"]/p/text()')
    summary = dom_tree.xpath('//p[@class="summary"]/text()')

    for text in summary:
        print(text)
    for text in paragraphs:
        print("<p>" + text + "</p>")
# Read url.txt one line at a time and scrape each listed address.
# The context manager closes the file even if readalight() raises.
with open('url.txt', 'r', encoding='UTF-8') as f:
    for line in f:
        # Each line still carries its own newline, so suppress print's.
        print(line, end='')
        readalight(line)
# 以上为 Python 逐条读取网址、使用 XPath 采集数据的方案。
