PHP制作百度词典查词采集器

这篇文章主要介绍了用 PHP 制作百度词典查词采集器的相关资料,需要的朋友可以参考。

百度dict 采集样本

这是一个采集百度词典(dict)查词后全部结果数据的采集器,项目中还附带了 13.5 万条单词库和一个简单的采集示例。这里把核心类 dict.class.php 放出来,项目地址:http://github.com/widuu/baidu_dict ,有需要的可以直接 fork。这个工具用的人不多,希望对用得上的朋友有所帮助。

  phonetic symbols
	 *				"pro"	 => pronunciation
	 *				"example"=> example sentences
	 *				"explain"=> concise definitions
	 *				"synonym"=> synonyms and antonyms
	 *				"phrase" => phrase array
	 *			)
	 *
	 */
	public function content($word){
		$this -> word = $word;
		// NOTE(review): every getter below calls getContent() itself, so one
		// content() call performs six HTTP requests for the same page.
		$symbol  = $this -> Pronounced();
		$pro     = $this -> getSay();
		$example = $this -> getExample();
		$explain = $this -> getExplain();
		$synonym = $this -> getSynonym();
		$phrase  = $this -> getPhrase();
		$result = array(
			"symbol" => $symbol,		// phonetic symbols
			"pro"	 => $pro,			// pronunciation
			"example"=> $example,		// example sentences
			"explain"=> $explain,		// concise definitions
			"synonym"=> $synonym,		// synonyms and antonyms
			"phrase" => $phrase 		// phrase array
		);
		return $result;
	}

	/**
	 * Fetch the Baidu dictionary result page for the current word via cURL.
	 *
	 * On a transfer error the cURL error message is echoed and the (false)
	 * result of curl_exec() is still returned to the caller.
	 *
	 * @return string|false Response body, or false when the transfer fails.
	 */
	private function getContent(){
		$useragent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0";
		$ch = curl_init();
		// NOTE(review): the word is concatenated without URL-encoding; presumably
		// callers pass plain single words — confirm before feeding phrases.
		$url = "http://dict.baidu.com/s?wd=".$this->word;
		curl_setopt($ch, CURLOPT_URL, $url);
		curl_setopt($ch, CURLOPT_USERAGENT,$useragent);
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);
		curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
		curl_setopt($ch, CURLOPT_HTTPGET, 1);
		curl_setopt($ch, CURLOPT_AUTOREFERER,1);
		curl_setopt($ch, CURLOPT_HEADER, 0);
		curl_setopt($ch, CURLOPT_TIMEOUT, 30);
		$result = curl_exec($ch);
		// Fixed: the original inspected the undefined variable $curl here, so
		// transfer errors were never reported. The handle is $ch.
		if (curl_errno($ch)) {
			echo 'Errno'.curl_error($ch);
		}
		curl_close($ch);
		return $result;
	}

	/**
	 * Extract the first two captures following the "EN-US" marker in the page.
	 * content() exposes this value under the "symbol" key.
	 *
	 * @return array array('en' => first match, 'us' => second match); an entry
	 *               is null when the page did not contain that match.
	 */
	private function Pronounced(){
		$data = $this -> getContent();
		preg_match_all("/\"EN\-US\"\>(.*)\<\/b\>/Ui",$data,$pronounced);
		return array(
			'en' => isset($pronounced[1][0]) ? $pronounced[1][0] : null,
			'us' => isset($pronounced[1][1]) ? $pronounced[1][1] : null
		);
	}

	/**
	 * Extract the first two url="..." attribute values from the page.
	 * content() exposes this value under the "pro" (pronunciation) key.
	 *
	 * @return array array('en' => first match, 'us' => second match); an entry
	 *               is null when the page did not contain that match.
	 */
	private function getSay(){
		$data = $this -> getContent();
		preg_match_all("/url=\"(.*)\"/Ui",$data,$pronounced);
		return array(
			'en' => isset($pronounced[1][0]) ? $pronounced[1][0] : null,
			'us' => isset($pronounced[1][1]) ? $pronounced[1][1] : null
		);
	}

	/**
	 * Extract example sentences from the page's "var example_data = ...];"
	 * JavaScript literal.
	 *
	 * The literal is split on "[[[" and "[[" boundaries, the quoted first
	 * element of each innermost array is collected, each group is joined with
	 * spaces, and the groups are finally paired up two at a time.
	 *
	 * @return array List of chunks of up to two strings each.
	 */
	private function getExample(){
		$str = "";
		$data = $this -> getContent();
		preg_match_all("/var example_data = (.*)\]\;/Us",$data,$example);
		// An absent match is treated as an empty literal (same result as before,
		// without the undefined-offset notice).
		$raw = isset($example[1][0]) ? $example[1][0] : "";
		$data1 = "[[[".ltrim($raw,"[");
		$data2 = explode("[[[",$data1);
		foreach($data2 as $value){
			$data3 = explode("[[","[[".$value);
			foreach ($data3 as $v) {
				preg_match_all("/\[\"(.*)\",/Us","[".$v, $match);
				if(!empty($match[1])){
					// Fixed: separator first. The legacy implode(array, string)
					// order was removed in PHP 8.0.
					$str .= implode(" ", $match[1])."@";
				}
			}
		}
		// "@" is used as the temporary separator between groups.
		$data4 = trim($str,"@");
		$data5 = explode("@", $data4);
		$result = array_chunk($data5, 2);
		return $result;
	}

	/**
	 * Extract the concise definitions.
	 *
	 * NOTE(review): the three patterns below look damaged — "(?P.*)" is not a
	 * valid named group and the first pattern ends in an escaped delimiter.
	 * Presumably HTML tags were stripped when this listing was extracted;
	 * restore them from the upstream project before relying on this method.
	 * The patterns are reproduced unchanged here.
	 *
	 * @return array array("x" => map built from the "adj"/"name" captures,
	 *                     "b" => map built from the "tag"/"word" captures)
	 */
	private function getExplain(){
		$data = $this -> getContent();
		preg_match_all("/id\=\"en\-simple\-means\"\>(.*)\/Us",$data,$explain);
		$r_data = $explain[1][0];
		preg_match_all("/\\(?P.*)\<\/strong\>\(?P.*)\<\/span\>\<\/p\>/Us", $r_data, $a_data);
		preg_match_all("/\(?P[^\>]+)\:\(?P.*)\<\/a\>\<\/span\>/Us", $r_data, $b_data);
		$result = array();
		foreach ($a_data["adj"] as $key => $value) {
			$result[$value] = $a_data["name"][$key];
		}
		$word_b = array();
		foreach ($b_data["tag"] as $key => $value) {
			$word_b[$value] = strip_tags($b_data["word"][$key]);
		}
		$result_data = array("x" => $result,"b" => $word_b);
		return $result_data;
	}

	/**
	 * Extract synonyms and antonyms.
	 *
	 * NOTE(review): explode() is called with an empty delimiter and two of the
	 * patterns contain "(?P.*)" / "(?.*)", none of which are valid. Presumably
	 * HTML tags were stripped when this listing was extracted; restore them
	 * from the upstream project. The strings are reproduced unchanged here.
	 *
	 * @return array Nested array indexed by section, then by the "adj" capture,
	 *               then by the "title" capture; values are tag-stripped text.
	 */
	private function getSynonym(){
		$data = $this -> getContent();
		preg_match_all("/id=\"en\-syn\-ant\"\>(.*)/Us",$data,$synonym);
		$content = $synonym[1][0];
		$data1 = explode("", $content);
		$result = array();
		$data2 = array();
		foreach ($data1 as $key => $value) {
			preg_match_all("/\(?P.*)\ \;\<\/strong\>\<\/div\>\\(?.*)\<\/ul\>/Us", $value, $r_data);
			$data2[$key]["adj"] = $r_data["adj"];
			$data2[$key]["content"] = $r_data["content"];
		}
		foreach ($data2 as $key => $value) {
			foreach ($value["content"] as $k => $v) {
				if(!empty($v)){
					preg_match_all("/\\(?P.*)\<\/p\>(?P<value>.*)\<\/li>/Us", $v, $v_data);
					foreach ($v_data['title'] as $m => $d) {
						// Replace each closing </a> with a space before stripping
						// tags so adjacent link texts do not run together.
						$data = strip_tags(preg_replace("<</a>>"," ", $v_data["value"][$m]));
						$result[$key][$value["adj"][$k]][$d] = $data;
					}
				}
			}
		}
		return $result;
	}

	/**
	 * Extract phrases from the section between id="en-phrase" and the
	 * following <div class="source">.
	 *
	 * The section is split on "</dd>" and at most self::$num entries are
	 * processed (self::$num is a static property declared outside the visible
	 * part of this class). Each entry is split on "</p>": short entries map a
	 * phrase to one meaning, longer entries nest the additional pairs.
	 *
	 * @return array Map of phrase => meaning string, or phrase => nested array.
	 */
	private function getPhrase(){
		$num = self::$num;
		$data = $this -> getContent();
		preg_match_all("/id=\"en\-phrase\"\>(.*)\<div class\=\"source\"\>/Us",$data,$phrase);
		$data = explode("</dd>",$phrase[1][0]);
		$data1 = array_slice($data,0,$num);
		$result = array();
		foreach ($data1 as $value) {
			$data2 = explode("</p>", $value);
			$n = count($data2);
			if($n<=3){
				$result[str_replace(" ","",strip_tags($data2[0]))] = strip_tags($data2[1]);
			}else{
				// Drop the trailing remainder, then separate the leading
				// phrase/meaning pair from the extra pairs that follow it.
				$data3 = array_slice($data2,0,$n-1);
				$data4 = array_slice($data2,0,2);
				$res = array_diff($data3,$data4);
				$data5 = array_chunk($res,2);
				$key_value = trim(str_replace(" ","",strip_tags($data4[0])));
				$result[$key_value] = strip_tags($data4[1]);
				// Distinct loop variable names: the original reused $key/$value
				// from the outer loop here.
				foreach ($data5 as $pair) {
					foreach ($pair as $k => $v) {
						$pair[$k] = strip_tags($v);
					}
					$array = array($result[$key_value],$pair);
					if (array_key_exists($key_value, $result)){
						$result[$key_value] = $array;
					}
				}
			}
		}
		return $result;
	}

	/**
	 * Convert an array to its string representation.
	 *
	 * @param  array  $data        Array to convert.
	 * @param  bool   $isformdata  When 0, new_stripslashes() is not applied;
	 *                             optional, defaults to 1.
	 * @return string  addslashes(var_export($data, TRUE)), or an empty string
	 *                 when $data == ''.
	 */
	private function array2string($data, $isformdata = 1) {
		if($data == '') return '';
		if($isformdata) $data = $this->new_stripslashes($data);
		return addslashes(var_export($data, TRUE));
	}

	/**
	 * Apply stripslashes() to a string, or recursively to every value of an array.
	 *
	 * @param  mixed $string String or array to process.
	 * @return mixed The processed string or array.
	 */
	private function new_stripslashes($string) {
		if(!is_array($string)) return stripslashes($string);
		foreach($string as $key => $val) $string[$key] = $this->new_stripslashes($val);
		return $string;
	}
}
// $word = new dict("express");
// $word ->content();
</pre></div><p>以上就是本文的全部内容了,非常实用的功能,希望小伙伴们能够喜欢。</p></div><p>以上就是PHP制作百度词典查词采集器的详细内容,更多请关注0133技术站其它相关文章!</p></article><div class="post-actions"><a href="javascript:;" etap="like" class="post-like action action-like" data-pid="0"><i
                        class="fa fa-thumbs-o-up"></i>赞(<span>0</span>)</a><a href="javascript:;"
                                                                               class="action action-rewards"
                                                                               data-event="rewards"><i
                    class="fa fa-jpy"></i> 打赏</a></div><div class="post-copyright">未经允许不得转载:<a href="/">0133技术站首页</a> » <a href="/cate153/">PHP编程</a></div><!--<div class="article-tags">标签:<a href="/archives/tag/javascript" rel="tag">JavaScript</a><a
                    href="/archives/tag/%e5%be%ae%e4%bf%a1" rel="tag">微信</a><a
                    href="/archives/tag/%e5%be%ae%e4%bf%a1%e7%9a%84%e7%89%88%e6%9c%ac%e5%8f%b7" rel="tag">微信的版本号</a><a
                    href="/archives/tag/%e7%89%88%e6%9c%ac%e5%8f%b7" rel="tag">版本号</a><a
                    href="/archives/tag/%e8%af%ad%e4%b9%89%e5%8c%96" rel="tag">语义化</a></div>--><nav class="article-nav"><span class="article-nav-prev">上一篇<br><a href="/cate153/11555934232100.html" rel="prev">PHP缓存系统APCu扩展的使用</a></span><span class="article-nav-next">下一篇<br><a href="/cate153/11555949144400.html" rel="next">thinkphp5上传图片及生成缩略图公共方法(分享)</a></span></nav><div class="relates relates-thumb"><div class="title"><h3>相关文章</h3></div><ul><li><a href="/cate153/1114368776900.html"><img data-src="https://ss.0133.cn/article/64/2a/15/642a15d53978530b59926bd274131468.jpg-160" alt="thinkPHP5使用Rabc实现权限管理" class="thumb"></a><a href="/cate153/1114368776900.html">thinkPHP5使用Rabc实现权限管理</a></li><li><a href="/cate153/1114615561025.html"><img data-src="https://ss.0133.cn/article/31/68/3d/31683d3113636bc99952c06ff8d4bfbd.jpg-160" alt="php4的session功能评述(二)" class="thumb"></a><a href="/cate153/1114615561025.html">php4的session功能评述(二)</a></li><li><a href="/cate153/1114672436900.html"><img data-src="https://ss.0133.cn/article/7c/c8/6c/7cc86ca9ff7e48b453c41ba7f1b7eb98.jpg-160" alt="php比较两个指定的日期的实例讲解" class="thumb"></a><a href="/cate153/1114672436900.html">php比较两个指定的日期的实例讲解</a></li><li><a href="/cate153/1114673648225.html"><img data-src="https://ss.0133.cn/article/7c/c8/6c/7cc86ca9ff7e48b453c41ba7f1b7eb98.jpg-160" alt="laravel多视图共享数据实例代码" class="thumb"></a><a href="/cate153/1114673648225.html">laravel多视图共享数据实例代码</a></li><li><a href="/cate153/1114674859600.html"><img data-src="https://ss.0133.cn/article/58/93/a1/5893a175f403b71a1ad32a1cbe84cb91.jpg-160" alt="图文详解laravel多对多关联模型" class="thumb"></a><a href="/cate153/1114674859600.html">图文详解laravel多对多关联模型</a></li><li><a href="/cate153/1114676071025.html"><img data-src="https://ss.0133.cn/article/58/93/a1/5893a175f403b71a1ad32a1cbe84cb91.jpg-160" alt="HTTP中header头部信息详解" class="thumb"></a><a href="/cate153/1114676071025.html">HTTP中header头部信息详解</a></li><li><a href="/cate153/1114677282500.html"><img 
data-src="https://ss.0133.cn/article/58/93/a1/5893a175f403b71a1ad32a1cbe84cb91.jpg-160" alt="Elasticsearch属性单词常用解析说明" class="thumb"></a><a href="/cate153/1114677282500.html">Elasticsearch属性单词常用解析说明</a></li><li><a href="/cate153/1114678494025.html"><img data-src="https://ss.0133.cn/article/58/93/a1/5893a175f403b71a1ad32a1cbe84cb91.jpg-160" alt="PHP之CI框架学习讲解" class="thumb"></a><a href="/cate153/1114678494025.html">PHP之CI框架学习讲解</a></li></ul></div><!--
            <div class="title" id="comments"><h3>评论
                    <small>抢沙发</small></h3></div><div id="respond" class="no_webshot"><form action="return false;" method="post" id="commentform"><div class="comt"><div class="comt-title"><img data-src="https://secure.gravatar.com/avatar/?s=100&d=mm" class="avatar avatar-100"
                                 height="50" width="50"><p><a id="cancel-comment-reply-link" href="javascript:;">取消</a></p></div><div class="comt-box"><textarea placeholder="你的评论可以一针见血" class="input-block-level comt-area" name="comment"
                                      id="comment" cols="100%" rows="3" tabindex="1"
                                      onkeydown="if(event.ctrlKey&&event.keyCode==13){document.getElementById('submit').click();return false};"></textarea><div class="comt-ctrl"><div class="comt-tips"><input type='hidden' name='comment_post_ID' value='6053'
                                                              id='comment_post_ID'/><input type='hidden' name='comment_parent' id='comment_parent' value='0'/><p style="display: none;"><input type="hidden" id="akismet_comment_nonce"
                                                                     name="akismet_comment_nonce" value="40dd7081eb"/></p><label for="comment_mail_notify" class="checkbox inline hide"
                                               style="padding-top:0"><input type="checkbox" name="comment_mail_notify"
                                                                            id="comment_mail_notify"
                                                                            value="comment_mail_notify"
                                                                            checked="checked"/>有人回复时邮件通知我</label><p style="display: none;"><input type="hidden" id="ak_js" name="ak_js" value="245"/></p></div><button type="button" name="submit" id="submit" tabindex="5">提交评论</button>--><!-- <span data-type="comment-insert-smilie" class="muted comt-smilie"><i class="icon-thumbs-up icon12"></i> 表情</span> --><!--
                            </div></div><div class="comt-comterinfo" id="comment-author-info"><ul><li class="form-inline"><label class="hide" for="author">昵称</label><input class="ipt" type="text" name="author" id="author" value="" tabindex="2" placeholder="昵称"><span class="text-muted">昵称 (必填)</span></li><li class="form-inline"><label class="hide" for="email">邮箱</label><input class="ipt" type="text" name="email" id="email" value="" tabindex="3" placeholder="邮箱"><span class="text-muted">邮箱 (必填)</span></li><li class="form-inline"><label class="hide" for="url">网址</label><input class="ipt" type="text" name="url" id="url" value="" tabindex="4" placeholder="网址"><span class="text-muted">网址</span></li></ul></div></div></form></div>--></div></div><div class="sidebar"><div class="widget widget_ui_tags"><h3>编程</h3><div class="items"><a href="/cate104/">Java</a><a href="/cate105/">C语言</a><a href="/cate133/">新手学堂</a><a href="/cate149/">数据库</a><a href="/cate151/">ASP编程</a><a href="/cate153/">PHP编程</a><a href="/cate154/">XML/RSS</a><a href="/cate155/">Flex</a><a href="/cate156/">正则表达式</a><a href="/cate158/">R语言</a><a href="/cate159/">汇编语言</a><a href="/cate160/">其他教程</a><a href="/cate165/">移动</a></div></div><form method="get" class="search-form clearfix" id="search-formhybrid-search" target="_blank" action="/search/article/"><div class="search-input-wrap"><input type="text" class="search-text" placeholder="来搜我" name="word" id="search-texthybrid-search"
                       value="" data-placeholder=""><b class="search-liaosheji"></b><button type="submit" class="search-button"><i class="icon-search"></i></button></div></form><div style="text-align:center;margin:0 auto;"><ul class='hot-search layui-clear'><li style="float:left;margin-right:20px;">热门搜索:</li><li style="float:left;margin-right:20px;"><a href='/search/article/?word=百度词典'>百度词典</a></li><li style="float:left;margin-right:20px;"><a href='/search/article/?word=百度歌词'>百度歌词</a></li><li style="float:left;margin-right:20px;"><a href='/search/article/?word=百度词条'>百度词条</a></li><li style="float:left;margin-right:20px;"><a href='/search/article/?word=百度分词'>百度分词</a></li><li style="float:left;margin-right:20px;"><a href='/search/article/?word=百度热词'>百度热词</a></li></ul></div><br/><div class="widget widget_ui_posts"><h3>置顶推荐</h3><ul><li><a href="/cate99/113247636454400.html"><span class="thumbnail"><img data-src="https://ss.0133.cn/upload/article/000/000/001/61c030e32e4d9281.jpg" alt="qq火花是过了24小时断吗" class="thumb"></span><span class="text">qq火花是过了24小时断吗</span><span
                            class="muted">2021-12-20</span></a></li></ul></div><div class="widget widget_recent_entries"><h3>猜你喜欢</h3><ul><li><a href="/cate153/11555762755025.html" target="_blank">php网页标题中文乱码的有效解决方法</a><span class="post-date">2021-10-08</span></li><li><a href="/cate153/11550237528400.html" target="_blank">深入掌握include_once与require_once的区别</a><span class="post-date">2021-10-08</span></li><li><a href="/cate153/1156448968100.html" target="_blank">用PHP实现弹出消息提示框的两种方法</a><span class="post-date">2021-09-13</span></li><li><a href="/cate153/1157700804100.html" target="_blank">利用PHP扩展Xhprof分析项目性能实践教程</a><span class="post-date">2021-09-13</span></li><li><a href="/cate153/11321024188100.html" target="_blank">php数据库的增删改查 php与javascript之间的交互</a><span class="post-date">2021-10-04</span></li><li><a href="/cate153/1159826674025.html" target="_blank">PHPstorm激活码2020年5月13日亲测有效</a><span class="post-date">2021-09-14</span></li><li><a href="/cate153/1160000462500.html" target="_blank">PHP中文分词的简单实现代码分享</a><span class="post-date">2021-09-14</span></li><li><a href="/cate153/11320814584025.html" target="_blank">thinkPHP5框架路由常用知识点汇总</a><span class="post-date">2021-10-04</span></li></ul></div><!--<div class="widget widget_ui_tags"><h3>热门标签</h3><div class="items"><a href="/archives/tag/javascript">JavaScript (324)</a></div></div><div class="widget widget_ui_comments"><h3>最新评论</h3><ul><li><a href="" title=""><img data-src="" class="avatar avatar-100" height="50" width="50"><strong></strong></a></li></ul></div>--></div><div id="leftbar" class="leftbar"><div class="leftbar-con"><div style="text-align: center; width: 160px;"></div></div></div></section><footer class="footer"><div class="container"><p>© 2022 <a href="/">WEB前端开发</a></p><p><a href="/cate88/">工具教程</a> | <a href="/cate71/">前端开发</a> | <a href="/cate61/">常见问题</a> | <a
                href="/cate129/">操作系统</a> | <a href="/cate103/">编程</a>  | <a href="/cate108/">网络安全</a>  | <a href="/cate120/">设计</a>  | <a href="/cate137/">站长技巧</a></p><div class="footer-gav" style="margin-bottom: 10px ;font-size: 12px;"><img
                src="https://ss.0133.cn/newimg88/2016/06/beian-gov-cn.png" style="vertical-align: middle; height: 14px;"/><a href="https://beian.miit.gov.cn/" target="_blank">鄂ICP备2021014202号-2</a></div><div class="footer-qrcode"></div></div></footer><script>    window.jsui = {
        www: '',
        uri: '/wp-content/themes/dux',
        ver: '5.2.5',
        roll: ["1", "2"],
        ajaxpager: '0',
        url_rp: '/about'
    };
</script><script type='text/javascript' src='/wp-content/themes/dux/js/libs/bootstrap.min.js?ver=5.2.5'></script><script type='text/javascript' src='/wp-content/themes/dux/js/loader.js?ver=5.2.5'></script><script type='text/javascript' src='/wp-includes/js/wp-embed.min.js?ver=5.0.2'></script><script async="async" type='text/javascript' src='/wp-content/plugins/akismet/_inc/form.js?ver=4.0.8'></script><script type="text/javascript">    var $win = jQuery(window);

    var $WindowW = $win.width();

    var $threebar = jQuery("#leftbar");
    // When #leftbar exists, pin its .leftbar-con child to the top of the
    // viewport once the page has scrolled past the bar's initial offset.
    // Only applies when the window width captured at load is >= 1200.
    if ($threebar.length) {
        var offsetT = $threebar.offset().top;
        $win.on("scroll.leftbar", function () {
            if ($WindowW < 1200) {
                return;
            }
            var $inner = $threebar.find(".leftbar-con");
            if ($win.scrollTop() >= offsetT) {
                $inner.css({
                    "position": "fixed",
                    "top": 0
                });
                return;
            }
            $inner.css("position", "static");
        });
    }

    // Collect every <img> inside #article-content and pass the list to
    // mediumZoom with a white background and a 24px margin.
    var imagesArr = [];
    $('#article-content').find("img").each(function () {
        imagesArr.push(this);
    });
    mediumZoom(imagesArr, {
        margin: 24,
        background: '#FFFFFF',
        scrollOffset: 0
    })

</script><script>    (function (i, s, o, g, r, a, m) {
        i['GoogleAnalyticsObject'] = r;
        i[r] = i[r] || function () {
                    (i[r].q = i[r].q || []).push(arguments)
                }, i[r].l = 1 * new Date();
        a = s.createElement(o),
                m = s.getElementsByTagName(o)[0];
        a.async = 1;
        a.src = g;
        m.parentNode.insertBefore(a, m)
    })(window, document, 'script', '//www.google-analytics.com/analytics.js', 'ga');

    ga('create', 'UA-3448069-1', 'css88.com');
    ga('send', 'pageview');

</script><script>    var _hmt = _hmt || [];
    (function () {
        var hm = document.createElement("script");
        hm.src = "https://hm.baidu.com/hm.js?08985692e4db1e8be1cff8097c2979b7";
        var s = document.getElementsByTagName("script")[0];
        s.parentNode.insertBefore(hm, s);
    })();
</script><script>    (function () {
        var bp = document.createElement('script');
        var curProtocol = window.location.protocol.split(':')[0];
        if (curProtocol === 'https') {
            bp.src = 'https://zz.bdstatic.com/linksubmit/push.js';
        }
        else {
            bp.src = 'http://push.zhanzhang.baidu.com/push.js';
        }
        var s = document.getElementsByTagName("script")[0];
        s.parentNode.insertBefore(bp, s);
    })();
</script></body></html>