string.gne:一个用于从 HTML 中提取新闻正文的 aardio 库
By
money
at 2021-11-15 • 0人收藏 • 1300人看过
// Modeled on @青南's GNE (GeneralNewsExtractor) Python library.
import string.html;
import math;
import string.regex;

/*
string.gne: extracts the main text, title, author and publication date of a
news article from raw HTML, using text density and punctuation density scores.
Entry point: string.gne.extract(html).
*/
namespace string.gne{
	import console

	// Tag assumed to hold body text; its occurrences are counted as `pcount`.
	content_tag = 'p';

	// Short aliases for the global functions used throughout the namespace.
	rep = ..string.replace;
	ncr = ..string.xml.ncr;
	removeTag = ..string.html.removeTag;
	match = ..string.match;
	push = ..table.push;
	len = ..string.len;
	join = ..string.join;
	split = ..string.split;
	abs = ..math.abs;

	// Characters counted as punctuation by count_punctuation_num().
	// The first group holds the full-width (CJK) forms, the second the ASCII forms.
	// The previous table listed ! , ? ; : ( ) twice in ASCII, so the full-width
	// variants were never counted; they are restored here.
	punctuation = {
		["!"]=1;[","]=1;["。"]=1;["?"]=1;[";"]=1;[":"]=1;["“"]=1;["”"]=1;["‘"]=1;["’"]=1;["《"]=1;["》"]=1;["("]=1;
		[")"]=1;["【"]=1;["】"]=1;["、"]=1;["—"]=1;["…"]=1;["~"]=1;["·"]=1;["〉"]=1;["〈"]=1;
		[","]=1;["."]=1;["?"]=1;[":"]=1;[";"]=1;["'"]=1;[" "]=1;
		['"']=1;["!"]=1;["%"]=1;["("]=1;[")"]=1
	};

	// Tags that carry no article text. Not referenced by the functions below.
	uselesstag = {
		['img']=1;['svg']=1;['video']=1;['object']=1;['embed']=1;['audio']=1;['applet']=1;
		['map']=1;['area']=1;['base']=1;['head']=1;['basefont']=1;['br']=1;['button']=1;
		['input']=1;['canvas']=1;['iframe']=1;['frame']=1;['frameset']=1;
	}

	// Text nodes with one of these tags among their nearest ancestors are ignored by extract().
	uselesstag1 = {
		['a']=1;
	}

	// Labels that may precede an author name; each is concatenated with authorPerlTail.
	authorPerlHeads = {
		"责编";
		"作者";
		"编辑";
		"文";
		"撰文";
		"来源"
	}

	// NOTE(review): the bracket sets repeat ':' and '\:'; full-width colons may have
	// been lost in transcription. Kept byte-for-byte; confirm against the original source.
	authorPerlTail = "[:\: 丨/]\s*(:{2,5})[^:\::]*";

	// Date patterns tried by extract_pubDay(), from most to least specific.
	pubdayPerls = {
		"(202\d-\d{2}-\d{2} \d{2}\:\d{2}\:\d{2})";
		"(202\d/\d{2}/\d{2} \d{2}\:\d{2}\:\d{2})";
		"(202\d-\d{2}-\d{2} \d{2}\:\d{2})[^\:]";
		"(202\d/\d{2}/\d{2} \d{2}\:\d{2})[^\:]";
		"(202\d年\d{2}月\d{2}日 \d{2}时\d{2}分)";
		"(202\d年\d{2}月\d{2}日 \d{2}点\d{2}分)";
		"(202\d-\d{2}-\d{2}) ";
		"(202\d/\d{2}/\d{2}) ";
		"(202\d年\d{2}月\d{2}日) ";
		"(202\d-\d{1,2}-\d{1,2})";
		"(202\d/\d{1,2}/\d{1,2})";
		"(202\d年\d{1,2}月\d{1,2}日)";
	}

	/*
	Population variance: s^2 = [(x1-x)^2 + ... + (xn-x)^2] / n
	@param arr array of numbers
	@return the variance, or 0 for an empty array (avoids dividing by zero)
	*/
	variance = function(arr) {
		var m = #arr;
		if(!m) return 0;

		// Sum, then mean.
		var sum = 0;
		for(i=1;m;1){
			sum += arr[i];
		}
		var dAve = sum/m;

		// Sum of squared deviations from the mean.
		var dVar = 0;
		for(i=1;m;1){
			dVar += (arr[i]-dAve) * (arr[i]-dAve);
		}
		return dVar/m;
	}

	/*
	Standard deviation: sigma = sqrt(s^2)
	@param arr array of numbers
	@return square root of variance(arr)
	*/
	std = function(arr) {
		return ..math.sqrt(variance(arr));
	}

	/*
	Converts an HTML fragment to plain text.
	<br>, <p> and </div> become line breaks, every remaining tag becomes a line
	break, all spaces are removed and consecutive line breaks are collapsed.
	@param html HTML string
	@return the text passed through ncr(), or nothing when html is empty
	*/
	toText = function(html){
		if(!#html) return;

		// Rewrite <pre> content so its line breaks survive the whitespace collapse below.
		html = rep(html, "\<\s*pre[^\>]*?\s*\>(.*?)\<\s*/pre\s*>",
			function(c){
				// NOTE(review): as written this replaces a space with a space (a no-op);
				// the replacement was possibly an HTML entity lost in transcription. Confirm.
				c = rep(c," "," ");
				c = rep(c,'\n',"<br>");
				return c;
			}
		);
		html = rep(html,"\s+"," ");
		html = rep(html,"\<[bB][rR]\s*/*\>",'\r\n');
		html = rep(html,"\</*[pP]\>",'\r\n');
		html = rep(html,"\</div\>",'\r\n');
		html = rep(html,"\</DIV\>",'\r\n');
		html = rep(html,"\<.+?\>",'\r\n');
		html = rep(html,"[ ]+",'');
		html = rep(html,"[\r\n]+",'\r\n');
		return ncr(html);
	}

	/*
	Computes the text density of a node:

	          Ti - LTi
	  TDi = -----------
	         TGi - LTGi

	  Ti:   number of characters of text in node i
	  LTi:  number of characters of text inside <a> tags of node i
	  TGi:  number of tags in node i
	  LTGi: number of <a> tags in node i

	@param elm a string.html node
	@return table with density, text, ti, lti, pcount, tgi, ltgi and sbdi
	*/
	calc_text_density = function(elm){
		var ti_text = toText(elm.innerXml()):"";
		var ti = #ti_text;
		var lti = {}
		// NOTE(review): starts at -1, presumably so that the enumeration's visit of
		// elm itself is not counted as a tag; confirm against enumNodes().
		var tgi = -1;
		var ltgi = 0;
		var pcount = 0;
		elm.enumNodes(
			function(parentElement,index,tagName,childCount,xNode){
				if(tagName=='a'){
					push(lti, xNode.innerText());
					ltgi++;
				}
				if(tagName==content_tag){
					pcount++;
				}
				tgi++;
			}
		)

		// Total length of all link text.
		lti = join(lti,'');
		lti = #lti

		var density
		if (tgi == ltgi)
			density = 0 // every tag is a link: avoid dividing by zero
		else
			density = (ti - lti) / (tgi - ltgi);

		return {
			density=density;
			text=ti_text;
			ti=ti;
			lti=lti;
			pcount=pcount;
			tgi=tgi;
			ltgi=ltgi;
			sbdi = calc_sbdi(ti_text, ti, lti)
		}
	}

	/*
	Counts the characters of text that appear in the punctuation table.
	@param text string, may be null
	@return number of punctuation characters, 0 when text is null
	*/
	count_punctuation_num = function( text){
		if(!text) return 0;
		var count = 0
		var tab = split(text)
		for(i=1;#tab;1){
			if(punctuation[tab[i]]){
				count++
			}
		}
		return count
	}

	/*
	Computes the symbol (punctuation) density of a node:

	           Ti - LTi
	  SbDi = ------------
	           Sbi + 1

	  SbDi: symbol density
	  Sbi:  number of punctuation characters

	@param text node text
	@param ti   length of the node text
	@param lti  length of the link text inside the node
	@return the symbol density, or 1 when it evaluates to 0
	*/
	calc_sbdi = function(text, ti, lti){
		var sbi = count_punctuation_num(text)
		// The divisor follows the documented formula (Sbi + 1); `var` keeps the
		// result local instead of leaking it into the namespace.
		var sbdi = (ti - lti) / (sbi + 1)
		return sbdi || 1
	}

	/*
	Standard deviation of the text density over all candidate nodes.
	@param node_info array of tables returned by calc_text_density()
	@return standard deviation of their density fields
	*/
	calc_standard_deviation = function(node_info){
		var score_list = {}
		for(i=1;#node_info;1){
			push(score_list, node_info[i].density)
		}
		return std(score_list)
	}

	/*
	Stores a score on every entry of node_info:

	  score = log(std) * ndi * log10(text_tag_count + 2) * log(sbdi)

	  std:            standard deviation of the text density of all nodes
	  ndi:            text density of node i
	  text_tag_count: number of tags holding body text (here the count of content_tag)
	  sbdi:           symbol density of node i

	@param std       standard deviation from calc_standard_deviation()
	@param node_info array of tables returned by calc_text_density(); modified in place
	*/
	calc_new_score = function(std, node_info){
		for(i=1;#node_info;1){
			var info = node_info[i];
			info.score = ..math.log(std) * info.density * ..math.log10(info.pcount+2) * ..math.log(info.sbdi)
		}
	}

	/*
	Returns the page title: the text of the first non-empty element among
	<title>, <h1>, <h2>, <h3>, <h4>, tried in that order.
	@param html HTML string
	@return title text, or nothing when none of the elements has text
	*/
	extract_title = function(html){
		html = removeTag(html,"script","style");
		var doc = ..string.html(html);

		var tags = {"title";"h1";"h2";"h3";"h4"};
		for(i=1;#tags;1){
			var elm = doc.queryEle({tagName=tags[i]});
			if(elm){
				var text = elm.innerText();
				if(#text) return text;
			}
		}
	}

	/*
	Returns the first match of an author label followed by authorPerlTail.
	@param html HTML string
	@return the matched author, or nothing when no label matches
	*/
	extract_author = function(html){
		for(i=1;#authorPerlHeads;1){
			var perl = authorPerlHeads[i]+authorPerlTail;
			var author = match(html, perl);
			if(author){
				return author;
			}
		}
	}

	/*
	Returns the publication date found in html.
	Every pattern in pubdayPerls is tried; among the matches, the one that occurs
	earliest in html wins, and on equal position the longest match wins.
	@param html HTML string
	@return date string, or nothing when no pattern matches
	*/
	extract_pubDay = function(html){
		var tab = {}
		for(i=1;#pubdayPerls;1){
			var pubday = match(html, pubdayPerls[i])
			if(pubday){
				push(tab, {pubday=pubday;idx=..string.find(html, pubdayPerls[i])})
			}
		}
		if(#tab){
			..table.sort(tab,function(b){
				if(owner.idx==b.idx){
					return #owner.pubday>#b.pubday;
				}
				return owner.idx<b.idx;
			})
			return tab[1].pubday;
		}
	}

	/*
	Extracts the article from a page.
	@param html HTML source of the page
	@return table {content; title; author; pubday}, or nothing when no candidate node exists
	*/
	extract = function(html){
		// Strip HTML comments first.
		html = rep(html,"\<\!--.*?--\>" , "");

		// Metadata is read from the whole document, before it is reduced to <body>.
		var author = extract_author(html);
		var title = extract_title(html);
		var pubday = extract_pubDay(html);

		html = match(html,"\<\s*<@@body@>[^\>]*?\s*\>.+") : html;
		html = removeTag(html,"head","script","style")

		var doc = ..string.html(html);
		var node_info = {}
		var onlyTxt = {}
		doc.enumNodes(
			function(parentElement,index,tagName,childCount,xNode){
				// Candidates: elements with more than one child, excluding links.
				// (Previously compared the undefined name `tagname`, so links were never excluded.)
				if(tagName and childCount>1 and tagName!='a'){
					push(node_info, calc_text_density(xNode))
				}

				// Nodes without a tag name: collect their text unless one of the
				// nearest 5 ancestors is listed in uselesstag1.
				if(!tagName){
					var elm = parentElement;
					var lv = 5
					var founda = 0;
					while(elm and lv){
						if(uselesstag1[elm.tagName]) {
							founda = 1
							break;
						}
						elm = elm.getParent()
						lv--
					}
					// Keep only text longer than one byte that contains punctuation.
					if(!founda and #xNode.text>1 and count_punctuation_num(xNode.text))
						push(onlyTxt, xNode.text)
				}
			}
		)
		onlyTxt = toText(join(onlyTxt,'\r\n'))

		var textlen = #onlyTxt;
		// Named densityStd so the namespace function `std` is not shadowed.
		var densityStd = calc_standard_deviation(node_info)
		calc_new_score(densityStd, node_info)

		if(#onlyTxt){
			for(i=1;#node_info;1){
				var info = node_info[i];

				// Number of bytes of this node's lines that also occur in the collected text.
				var found = 0
				for(v in ..string.lines(info.text)){
					if(v and ..string.indexOf(onlyTxt, v)){
						found += #v;
					}
				}
				info.score1 = found/textlen;
				info.score2 = #info.text/textlen;
				// Discard nodes whose text is 1.2 times the collected text or longer.
				info.score2 = info.score2<1.2?info.score2:0;

				// Scoring rule; tune as needed.
				info.score = info.score1 * info.score2
				info.score = info.score * ..math.log(densityStd) * info.density * ..math.log10(info.pcount+2) * ..math.log(info.sbdi)
			}
		}

		// Highest score first. To inspect the ranking, console.dump(node_info[i]) here.
		..table.sort(node_info,function(b){
			return owner.score>b.score;
		})

		if(#node_info)
			return {
				content = node_info[1].text;
				title = title;
				author = author;
				pubday = pubday
			};
	}
}
示例:
import console;
import inet.whttp;
import string.gne;

// Download a sample article and print what string.gne extracts from it.
var http = inet.whttp();
var html = http.get("https://www.cnblogs.com/xieqiankun/p/gne_release.html");

if(html){
	var tab = string.gne.extract(html);
	console.dump(tab);
}
else {
	// extract() passes html straight to string.replace, so skip it when the download failed.
	console.log("Download failed: no HTML returned.");
}

console.pause();
3 个回复 | 最后更新于 2021-11-29
智能文档新成员:动态文档智能模型MarkupLM
https://mp.weixin.qq.com/s/sEFUe5frk5lKSu7cXqfxgQ
源码:https://github.com/microsoft/unilm/tree/master/markuplm
有时间了尝试一下这个
登录后方可回帖
python
2.aardio
也查阅了 GNE 提供的论文,但目前还没有详细阅读其代码;测试后发现提取效果与原版仍有差距,暂时不清楚问题出在哪里。