I used the BeautifulSoup library in a fairly beginner way. For Chinese content, some pages need from_encoding = "GBK" added when parsing, and on the database side there is the usual round of escaping, e.g. con.escape_string(). This problem cost me an afternoon plus an evening, and I finally stumbled on the solution on StackOverflow. Taking Tencent News (腾讯新闻) as the example:
from_encoding = "GBK"
con.escape_string()
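Before the full script, here is a minimal sketch that isolates just those two fixes (it assumes BeautifulSoup 4 and MySQLdb on Python 2; the demo_page table, its html column and the password are made up for illustration):

# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import urllib
import MySQLdb

# Fix 1: the page is GBK-encoded, so tell BeautifulSoup how to decode it,
# otherwise the Chinese text comes out as mojibake.
html = urllib.urlopen('http://tech.qq.com').read()
soup = BeautifulSoup(html, from_encoding='GBK')

# Fix 2: escape quotes/backslashes in the scraped HTML before splicing it
# into an SQL string (demo_page / html are hypothetical names).
con = MySQLdb.connect(host='localhost', user='root', passwd='secret', charset='utf8')
cur = con.cursor()
safe_html = con.escape_string(str(soup.find('div', 'Q-tpListInner')))
cur.execute("insert into demo_page(html) values('%s')" % safe_html)
con.commit()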
#!/usr/bin/python
# coding: utf-8
# author: fish
# Article details are collected into details_list -- title, author, publish
# date, abstract (description), content and source URL -- then written to MySQL.
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

import socket
import urllib
import MySQLdb
from bs4 import BeautifulSoup

url = 'http://tech.qq.com'
socket.setdefaulttimeout(200)

# The index page is GBK-encoded, so tell BeautifulSoup explicitly.
soup = BeautifulSoup(urllib.urlopen(url).read(), from_encoding='GBK')

# Collect the detail-page links from every headline block.
href_list = []
for block in soup.find_all('div', 'Q-tpListInner'):
    try:
        href_list.append(block.a.get('href'))
    except AttributeError:
        pass

details_list = []

# connect to MySQL and write every article
try:
    con = MySQLdb.connect(host='localhost', user='root', passwd='baidusql', charset='utf8')
    con.select_db('ali_app')
    cur = con.cursor()

    for href in href_list:
        sub_details_list = []
        detail_soup = BeautifulSoup(urllib.urlopen(href).read(), from_encoding='GBK')
        print href

        try:
            article_title = detail_soup.find('div', 'hd').h1.string
            article_pub_date = detail_soup.find('span', 'pubTime').string
            article_author = detail_soup.find('span', 'auth').string
            if not article_author:
                article_author = '腾讯科技'
            article_abridgement = detail_soup.find(attrs={'name': 'Description'}).get('content')
            article_contents = detail_soup.find('div', id='Cnt-Main-Article-QQ')
            article_source_address = href
            # Skip video pages and image-gallery pages.
            if (str(article_contents).find('videoPlayer') != -1
                    or str(detail_soup).find('gqMaskshowBT') != -1):
                continue
        except AttributeError:
            continue

        sub_details_list.append(article_title)
        sub_details_list.append(article_author)
        sub_details_list.append(article_pub_date)
        sub_details_list.append(article_source_address)
        sub_details_list.append(article_abridgement)
        sub_details_list.append(article_contents)

        # Delete any earlier copy of this article, then re-insert it.
        cur.execute('delete from jr_category where source_address = "%s"' % article_source_address)
        cur.execute('delete from jr_content where source_address = "%s"' % article_source_address)
        cur.execute('insert into jr_category(sid, title, date, author, source_address) '
                    'values(3, "%s", "%s", "%s", "%s")'
                    % (article_title, article_pub_date, article_author, article_source_address))
        # The article body is raw HTML full of quotes, so escape it before building the statement.
        article_contents = con.escape_string(str(article_contents))
        cur.execute("insert into jr_content(source_address, description, content) "
                    "values('%s', '%s', '%s')"
                    % (article_source_address, article_abridgement, article_contents))

        details_list.append(sub_details_list)

    con.commit()
    cur.close()
    con.close()
except MySQLdb.Error, e:
    print "Error %d: %s" % (e.args[0], e.args[1])
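A note on the design: instead of calling con.escape_string() and formatting the SQL string by hand, MySQLdb can also quote the values itself when they are passed as a separate parameter tuple. A hedged sketch against the same jr_content table (this is the standard cursor.execute(sql, args) form, swapped in as an alternative rather than what the script above does):

# MySQLdb quotes each value itself when it is passed separately from the SQL text,
# so con.escape_string() and the manual '%s' quoting become unnecessary.
cur.execute(
    "insert into jr_content(source_address, description, content) values(%s, %s, %s)",
    (article_source_address, article_abridgement, str(article_contents)))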