原理:利用 xml.etree.ElementTree 对 HTML 进行解析和写入(ElementTree 是 XML 解析器,因此 book.html 必须是格式良好的 XHTML,否则解析会失败)
import xml.etree.ElementTree as ET  # parses and serialises the XHTML outline

# Reserved answers that are honoured at every prompt.
_AGAIN = 'again'
_EXIT = 'exit'


def _save(root, path):
    """Write the tree rooted at *root* to *path*.

    The encoding is passed explicitly because ElementTree defaults to
    US-ASCII, which would turn the Chinese titles into numeric character
    references.
    """
    ET.ElementTree(root).write(path, encoding='utf-8')


def _heading_pair(tag, text, anchor):
    """Build the link-area entry and the body-area section for one heading.

    Returns ``(link_entry, body_entry)``:
    ``<div><TAG><a href="#ANCHOR">TEXT</a></TAG></div>`` and
    ``<div id="ANCHOR"><TAG>TEXT</TAG></div>``.
    """
    link_entry = ET.Element('div')
    link_heading = ET.SubElement(link_entry, tag)
    link_anchor = ET.SubElement(link_heading, 'a')
    link_anchor.text = text
    link_anchor.set('href', f'#{anchor}')

    body_entry = ET.Element('div')
    body_entry.set('id', anchor)
    body_heading = ET.SubElement(body_entry, tag)
    body_heading.text = text
    return link_entry, body_entry


def _choose(raw, count):
    """Convert a 1-based menu answer into a 0-based index.

    Returns ``None`` when *raw* is not an integer in ``1..count``; this also
    rejects negative numbers, which would otherwise index from the end.
    """
    try:
        number = int(raw)
    except ValueError:
        return None
    if 1 <= number <= count:
        return number - 1
    return None


def _is_exit(answer, show):
    """Report whether *answer* is the exit command, announcing it if so."""
    if answer == _EXIT:
        show('program terminated!')
        return True
    return False


def _list_titles(entries, tag, show):
    """Show a numbered list of the link texts found under *entries*."""
    for number, entry in enumerate(entries, start=1):
        title = entry.find(tag).find('a').text
        show(f'{number}、{title}')


def _create_heading(tag, anchor, link_parent, body_parent, save, ask, show):
    """Ask for a title and append the new heading to both areas.

    Returns ``False`` if the user typed the exit command, ``True`` otherwise
    (including when the user typed ``again`` and nothing was created).
    """
    title = ask('输入章节名:')
    if _is_exit(title, show):
        return False
    if title != _AGAIN:
        link_entry, body_entry = _heading_pair(tag, title, anchor)
        link_parent.append(link_entry)
        body_parent.append(body_entry)
        save()
    return True


def _append_paragraphs(section, save, ask, show):
    """Keep appending ``<p>`` elements to *section* until again/exit.

    Returns ``False`` on exit and ``True`` on ``again``.
    """
    while True:
        text = ask('输入内容:')
        if _is_exit(text, show):
            return False
        if text == _AGAIN:
            return True
        paragraph = ET.SubElement(section, 'p')
        paragraph.text = text
        save()


def _run_once(link_div, body_div, save, ask, show):
    """Run one pass through the menus; return ``False`` when the user exits."""
    show('输入again回到程序开始处,输入exit退出程序')

    link_chapters = link_div.findall('div')
    body_chapters = body_div.findall('div')
    _list_titles(link_chapters, 'h1', show)

    show('在此处创建输入0\n选择章节输入序号')
    a = ask('输入:')  # always a string, never a number
    if _is_exit(a, show):
        return False
    if a == _AGAIN:
        return True
    if a == '0':
        anchor = str(len(link_chapters) + 1)
        return _create_heading('h1', anchor, link_div, body_div,
                               save, ask, show)

    # A chapter is only selectable if it exists in both areas.
    chapter_index = _choose(a, min(len(link_chapters), len(body_chapters)))
    if chapter_index is None:
        show('输入无效,请重新选择')
        return True
    link_chapter = link_chapters[chapter_index]
    body_chapter = body_chapters[chapter_index]

    link_sections = link_chapter.findall('div')
    body_sections = body_chapter.findall('div')
    _list_titles(link_sections, 'h2', show)

    show('在此处创建输入0\n选择章节输入序号')
    b = ask('输入:')
    if _is_exit(b, show):
        return False
    if b == _AGAIN:
        return True
    if b == '0':
        # The anchor is "<chapter>_<section>"; it must use the selected
        # chapter number, not the total chapter count, or sections in
        # different chapters would share the same id.
        anchor = f'{chapter_index + 1}_{len(link_sections) + 1}'
        return _create_heading('h2', anchor, link_chapter, body_chapter,
                               save, ask, show)

    section_index = _choose(b, len(body_sections))
    if section_index is None:
        show('输入无效,请重新选择')
        return True
    section = body_sections[section_index]

    # Show the existing content of the chosen section before adding to it.
    for paragraph in section.findall('p'):
        show(paragraph.text)
    return _append_paragraphs(section, save, ask, show)


def circle(path='book.html', ask=input, show=print):
    """Interactively edit a two-level book outline stored in an XHTML file.

    The file's root element must have two direct children, ``<div id="link">``
    (the table of contents) and ``<div id="body">`` (the content). ``find`` and
    ``findall`` with a bare tag name only search direct children. The file
    must be well-formed XML, because ElementTree is an XML parser.

    The user can add a chapter (h1), add a section (h2) to a chapter, or
    append paragraphs to a section. Typing ``again`` at any prompt returns to
    the top-level menu; typing ``exit`` ends the session. Every change is
    written back to *path* immediately, and once more when the session ends.

    Args:
        path: File to read and update. Defaults to ``'book.html'``.
        ask: Callable taking a prompt string and returning the answer.
        show: Callable used for all output, called like ``print``.

    Raises:
        ValueError: If the link or body ``div`` is missing from the file.
    """
    root = ET.parse(path).getroot()
    link_div = root.find('div[@id="link"]')
    body_div = root.find('div[@id="body"]')
    if link_div is None or body_div is None:
        raise ValueError(
            f'{path} must contain <div id="link"> and <div id="body"> '
            'as direct children of the root element'
        )

    def save():
        _save(root, path)

    # A loop replaces the original self-recursion, so a long session cannot
    # hit the recursion limit.
    while _run_once(link_div, body_div, save, ask, show):
        pass
    save()


if __name__ == '__main__':
    circle()
注意:千万要仔细检查。本人就是因为某个变量名打错了字却没有发现,耽搁了很久……总之现在已经顺利解决了。
来源:https://www.cnblogs.com/daxiangcai/p/16188213.html
本站部分图文来源于网络,如有侵权请联系删除。