这是我在知乎专栏发表的那篇“知乎用户排行榜生成器 0.2 版”的源码。为了保证专栏的可读性,那边只放了压缩版;这里是完整版,有兴趣者可以自行调试修改。
这段脚本需要登录知乎网站后在浏览器控制台运行,更详细的用法请点上面链接查看。
2 | var userarray = "guxizhao,zou-dao-kou,xiaodaoren,cai-tong,xu-xiang-nan,unogzx,shenbin,PeterDeng,namiheike,wu-si-yang-32,yskin,jixin" ; |
10 | var followerlimit = 10; |
12 | var users = userarray.split( ',' ); |
14 | var result = new Array(); |
18 | function showmsg(msg) { $( "#msg" ).html(msg); } |
19 | function showresult() { |
20 | $( "#switchshowtable" ).show(0); |
21 | $( "#sorttype" ).show(0); |
22 | var rsdiv = $( "#result" ); |
24 | var tablehtm = "<table border='1' cellpadding='2' style='border-collapse: collapse;'><tr><td>编号</td><td>用户名</td><td>关注者</td><td>提问</td><td>回答</td><td>赞同</td><td>赞同/回答比</td></tr>" ; |
26 | tablehtm += "<tr><td>" + (parseInt(i) + 1) + "</td><td><a href='/people/" + result[i].id + "/' target='_blank'>" + result[i].name + "</a></td><td>" + result[i].follower + "</td><td>" + result[i].ask + "</td><td>" + result[i].answer + "</td><td>" + result[i].agree + "</td><td>" + result[i].ratio + "</td></tr>" ; |
28 | tablehtm += "</table>" ; |
32 | rsdiv.html( "编号,用户名,关注者,提问,回答,赞同,赞同/回答比" ); |
34 | rsdiv.append( "<br/>" + (parseInt(i) + 1) + ",<a href='/people/" + result[i].id + "/' target='_blank'>" + result[i].name + "</a>," + result[i].follower + "," + result[i].ask + "," + result[i].answer + "," + result[i].agree + "," + result[i].ratio); |
40 | var content = $( "#tempframe" ).contents(); |
41 | var name = content.find( ".title-section.ellipsis a" ).html(); |
42 | if (content.find( '.zu-button-more[aria-role]' ).length < 1) { |
43 | showmsg(name + "的" + cardcount + "个关注者加载完成" ); |
47 | content.find( '.zu-button-more[aria-role]' ).get(0).click(); |
48 | var total = content.find( ".zm-profile-side-following strong" ).html(); |
49 | cardcount = content.find( '.zh-general-list .zm-profile-card .zm-list-content-medium' ).length; |
50 | showmsg( "正在加载" + name + "的关注者:" + cardcount + "/" + total + "... <img style='vertical-align: text-bottom;' src='http://static.zhihu.com/static/img/spinner/grey-loading.gif'/>" ); |
51 | setTimeout(loadmore, 2000); |
56 | var cards = $( "#tempframe" ).contents().find( '.zh-general-list .zm-profile-card .zm-list-content-medium' ); |
57 | cards.each( function () { |
58 | var name = $( this ).find( 'a.zg-link' ).html(); |
59 | var id = $( this ).find( 'a.zg-link' ).attr( "href" ).replace( "http://www.zhihu.com/people/" , "" ); |
60 | var detail = $( this ).find( '.details' ); |
61 | var follower = Number(detail.eq(0).children().eq(0).html().split( ' ' )[0]); |
62 | var ask = Number(detail.eq(0).children().eq(1).html().split( ' ' )[0]); |
63 | var answer = Number(detail.eq(0).children().eq(2).html().split( ' ' )[0]); |
64 | var agree = Number(detail.eq(0).children().eq(3).html().split( ' ' )[0]); |
65 | if (answer >= answerlimit && agree >= agreelimit && agree / answer >= ratiolimit && follower > followerlimit) { |
69 | r.follower = follower; |
73 | r.ratio = (agree / answer).toFixed(2); |
84 | if (usercursor < users.length) { |
85 | showmsg( "共" + users.length + "个用户,准备扫描第" + (usercursor + 1) + "个... <img style='vertical-align: text-bottom;' src='http://static.zhihu.com/static/img/spinner/grey-loading.gif'/>" ); |
86 | $( "#tempframe" ).attr( "src" , "/people/" + users[usercursor] + "/followees" ); |
89 | showmsg( "所有" + users.length + "名用户的关注者已经全部扫描完成,共找到" + result.length + "个符合条件的用户" ); |
93 | function addresult(r) { |
95 | for (i in result) { if (r.id == result[i].id) { exist = true ; break ; } } |
96 | if (!exist) result.push(r); |
99 | function sortresult() { |
100 | if (result.length > 0) { |
101 | var type = $( "#sorttype" ).val(); |
104 | result = result.sort( function (a, b) { return b.ratio - a.ratio; }); |
107 | result = result.sort( function (a, b) { return b.agree - a.agree; }); |
110 | result = result.sort( function (a, b) { return b.answer - a.answer; }); |
113 | result = result.sort( function (a, b) { return b.ask - a.ask; }); |
116 | result = result.sort( function (a, b) { return b.follower - a.follower; }); |
124 | $( "body" ).prepend( '<div id="mask" style="width:100%;height:100%;top:0px;left:0px;position:fixed;z-index: 998;background-color: rgba(0, 0, 0, 0.4);text-align:center;"><div id="container" style="width:600px;height:400px;margin:80px auto 0px auto;position: relative;z-index: 999; padding: 5px;"><iframe id="tempframe" style="width:1px;height:1px;top:-999px;left:-999px;position:absolute;"></iframe><div id="msg" style="height: 30px;background-color: #C4D299;line-height: 30px;text-align: left;padding-left: 5px;"></div><div id="result" style="height: 350px;background-color: #F0F0F0;text-align: left;padding: 5px;margin-top: 5px;overflow-y: auto;"></div><input id="switchshowtable" style="display:none;position: absolute;width: 100px;top: 10px;right: 25px;" type="button" value="改为逗号分隔"/><select id="sorttype" name="sorttype" style="display:none;position: absolute;width: 100px;top: 45px;right: 25px;"><option value="ratio" selected>赞同/回答比</option><option value="agree">赞同</option><option value="answer">回答</option><option value="ask">提问</option><option value="follower">关注</option></select></div></div>' ); |
125 | $( "#switchshowtable" ).click( function () { showtable = !showtable; $( this ).val(showtable ? "改为逗号分隔" : "改为表格显示" ); showresult(); }); |
126 | $( "#sorttype" ).change( function () { sortresult(); showresult(); }); |
127 | $( "#tempframe" ).load( function () { loadmore(); }); |
请问楼主,这个抓取的结果如何保存为文本文件呢?我试过用 Scrapy 抓取,不过还没解决 cookie 和“更多”的问题 >_<
这个是网页js脚本,当然不存在cookie的问题了,而“更多”是找到按钮的dom对象然后直接click实现的,也没有保存为文本,而是写到页面div里。
我不懂Python,你先解决cookie,然后抓包看“更多”访问的是哪个url、带哪些参数就行了。
嗯、谢谢提供思路。我们已在你的代码基础上做了一点改动,可以抓到不少信息了。不过为了性能以及灵活性,还是得把Python的问题解决才行。
楼主您好,我最近在学习 js。您的 loadmore 函数中用到的 .title-section.ellipsis 和 .zu-button-more[aria-role],我没有在知乎网页源代码中找到,请问这是如何获得的呢?谢谢。
这段脚本是两个月前写的,最近知乎的网页有改版,一些元素的位置和名称变了。
.title-section.ellipsis 是为了找用户名,.zu-button-more 是为了找“更多”按钮。你可以自己来找一下。
谢谢楼主,我明白了。