python: xmlhelper
https://github.com/tesseract-ocr/tesseract
Tesseract引擎和中文包 (这是HP实验室最早开发的OCR)
https://pan.baidu.com/share/init?surl=XpeRVgiPTU7mmiMiyaXThg
python
https://digi.bib.uni-mannheim.de/tesseract/
https://github.com/ViewFaceCore/ViewFaceCore
pip install beautifulsoup4
xml:
<?xml version="1.0"?>
<data>
<country name="Liechtenstein">
<rank>1</rank>
<year>2008</year>
<gdppc>141100</gdppc>
<neighbor name="Austria" direction="E"/>
<neighbor name="Switzerland" direction="W"/>
</country>
<country name="Singapore">
<rank>4</rank>
<year>2011</year>
<gdppc>59900</gdppc>
<neighbor name="Malaysia" direction="N"/>
</country>
<country name="Panama">
<rank>68</rank>
<year>2011</year>
<gdppc>13600</gdppc>
<neighbor name="Costa Rica" direction="W"/>
<neighbor name="Colombia" direction="E"/>
</country>
</data>
# encoding: utf-8
# 版权所有 2023 涂聚文有限公司
# 许可信息查看:
# 描述: pip install beautifulsoup4
# Author : geovindu,Geovin Du 涂聚文.
# IDE : PyCharm 2023.1 python 311
# Datetime : 2023/7/16 22:17
# User : geovindu
# Product : PyCharm
# Project : pythonTkinterDemo
# File : XmlHelper.py
# explain : 学习
from xml.dom import minidom
import xml.etree.ElementTree as ET
import csv
import requests
import os
import sys
def readXml(url):
    """Parse the XML document at *url* and print its top-level elements.

    For every direct child of the root element, prints the tag name
    followed by its attribute dictionary.

    Args:
        url: path (or file object) of the XML document to parse.
    """
    root = ET.parse(url).getroot()
    for element in root:
        print(element.tag, element.attrib)
def writeXml(url):
    """Build a small DOM document and write it to *url* as UTF-8 XML.

    The emitted document is:
    <root><movie shelf="New Arrivals"><type>War, Thriller</type></movie></root>

    Args:
        url: destination file path for the XML output.
    """
    # Assemble the tree bottom-up with the minidom factory methods.
    document = minidom.Document()
    root = document.createElement('root')
    document.appendChild(root)

    movie = document.createElement('movie')
    movie.setAttribute('shelf', 'New Arrivals')
    root.appendChild(movie)

    genre = document.createElement('type')
    # Text content is itself a node, attached as a child of <type>.
    genre.appendChild(document.createTextNode("War, Thriller"))
    movie.appendChild(genre)

    try:
        with open(url, 'w', encoding='UTF-8') as handle:
            document.writexml(handle, indent='', addindent='\t', newl='\n', encoding='UTF-8')
    except Exception as error:
        print('错误:', error)
def loadRSS(url='http://www.hindustantimes.com/rss/topnews/rssfeed.xml',
            filename='topnewsfeed.xml'):
    """Download an RSS feed and save the raw response body to a local file.

    Args:
        url: feed URL to fetch; defaults to the Hindustan Times top-news
            feed (the original hard-coded value, so existing callers are
            unaffected).
        filename: local path the response bytes are written to; defaults to
            the original hard-coded 'topnewsfeed.xml'.

    Raises:
        requests.RequestException: on network failure, timeout, or a
            non-2xx HTTP status.
    """
    # A timeout prevents a dead or slow server from hanging the caller forever.
    resp = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors instead of silently saving an error page
    # that the downstream XML parser would choke on.
    resp.raise_for_status()
    # Write in binary mode so the feed's own declared encoding is preserved.
    with open(filename, 'wb') as f:
        f.write(resp.content)
def parseXML(xmlfile):
    """Parse an RSS file and return a list of news-item dictionaries.

    Each dict maps a child element's tag to the UTF-8 encoded bytes of its
    text, except the Yahoo media-namespace <media:content> element, whose
    'url' attribute is stored under the key 'media'.

    Args:
        xmlfile: path to the RSS XML file (expects <channel>/<item> layout).

    Returns:
        list[dict]: one dict per <item> element under <channel>.
    """
    tree = ET.parse(xmlfile)
    root = tree.getroot()
    newsitems = []
    for item in root.findall('./channel/item'):
        news = {}
        for child in item:
            # Namespaced tags appear as '{uri}localname' in ElementTree.
            if child.tag == '{http://search.yahoo.com/mrss/}content':
                news['media'] = child.attrib['url']
            else:
                # BUG FIX: child.text is None for empty elements such as
                # <description/>; calling .encode() on None raised
                # AttributeError. Fall back to an empty byte string.
                news[child.tag] = (child.text or '').encode('utf8')
        newsitems.append(news)
    return newsitems
def savetoCSV(newsitems, filename):
    """Write the parsed news items to *filename* as a CSV file.

    Args:
        newsitems: list of dicts whose keys are a subset of the fixed
            field list below; missing keys produce empty cells.
        filename: destination CSV path (overwritten if it exists).

    Raises:
        ValueError: if a dict contains a key outside the field list
            (csv.DictWriter default behavior).
    """
    # Column order of the output file.
    fields = ['guid', 'title', 'pubDate', 'description', 'link', 'media']
    # BUG FIX: the csv module requires newline='' on the file object;
    # without it every row is followed by a blank line on Windows.
    # An explicit encoding keeps the output stable across platforms.
    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fields)
        writer.writeheader()
        writer.writerows(newsitems)
def main():
    """Fetch the RSS feed, parse it, and export the news items to CSV."""
    # Refresh the cached feed file from the web.
    loadRSS()
    # Extract structured news items from the downloaded XML.
    items = parseXML('topnewsfeed.xml')
    # Persist the parsed items as a spreadsheet-friendly CSV file.
    savetoCSV(items, 'geovindu.csv')
# 哲学管理(学)人生, 文学艺术生活, 自动(计算机学)物理(学)工作, 生物(学)化学逆境, 历史(学)测绘(学)时间, 经济(学)数学金钱(理财), 心理(学)医学情绪, 诗词美容情感, 美学建筑(学)家园, 解构建构(分析)整合学习, 智商情商(IQ、EQ)运筹(学)生存.---Geovin Du(涂聚文)