%load_ext sql
%%sql
postgresql://postgres:postgres@localhost/hw8
%%sql
SET statement_timeout = 0;
SET lock_timeout = 0;
SET client_encoding = 'utf-8';
SET standard_conforming_strings = on;
SET check_function_bodies = false;
SET client_min_messages = warning;
%%sql
drop table if exists weibo;
CREATE TABLE weibo (
id bigint not null primary key,
weibo_id text,
daytime int,
name text,
content text,
day int,
month int,
sentiments float,
keyword0 text,
keyword1 text,
keyword2 text,
keyword3 text
);
copy weibo from 'D:\\works\\GIS\\data-analyse\\dataSets\\nCoV_total_p2.csv' CSV HEADER;
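Note: a server-side COPY requires that the CSV path be readable by the PostgreSQL server process itself. If the server cannot see that path, a client-side load works as well; a minimal sketch with psycopg2, assuming the same connection parameters and file location as above:
import psycopg2
# Client-side bulk load: the file is read by this Python process,
# not by the PostgreSQL server, so any local path is fine.
conn = psycopg2.connect(host='localhost', dbname='hw8',
                        user='postgres', password='postgres')
with conn, conn.cursor() as cur, \
     open(r'D:\works\GIS\data-analyse\dataSets\nCoV_total_p2.csv', encoding='utf-8') as f:
    cur.copy_expert("COPY weibo FROM STDIN CSV HEADER", f)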
The dataset consists of roughly 1,000,000 Weibo posts collected by random sampling between 2020-01-01 and 2020-02-20, using 230 topic keywords related to COVID-19 ("新冠肺炎").
The meaning of each column:
- id: record id (primary key)
- weibo_id: id of the Weibo post
- daytime: posting time
- name: screen name of the poster
- content: text of the post
- day, month: posting date
- sentiments: sentiment score of the post
- keyword0–keyword3: topic keywords matched by the post
%sql select * from weibo limit 5;
x = %sql select month, day, count(*) from weibo group by day,month order by month,day;
%matplotlib inline
count = len(x)
print(count)
x.bar()
The dataset covers 50 days in total, from Jan 1 to Feb 19, which the chart above also shows.
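As a quick sanity check on the date coverage, counting distinct (month, day) pairs should return 50:
%sql select count(distinct (month, day)) as days from weibo;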
Check whether any single user has multiple posts in the dataset:
%%sql
select weibo_id,count(*) c from weibo group by weibo_id order by c desc limit 5;
For most users the dataset contains only one or two posts, so we cannot analyze multiple posts from a single user.
Count how often each topic keyword appears across all posts, sorted from most to least frequent (word segmentation of the processed content comes later):
%%sql
select k0.keyword, k0.c+k1.c+k2.c+k3.c as c from
(select keyword0 keyword,count(*) c from weibo group by keyword0) k0,
(select keyword1 keyword,count(*) c from weibo group by keyword1) k1,
(select keyword2 keyword,count(*) c from weibo group by keyword2) k2,
(select keyword3 keyword,count(*) c from weibo group by keyword3) k3
where k0.keyword = k1.keyword and k2.keyword = k1.keyword and k2.keyword = k3.keyword and k1.keyword != '##'
order by c desc limit 10;
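Note that the four-way join above only keeps keywords that appear in all four keyword columns. An equivalent count can be written more simply by stacking the columns with UNION ALL before aggregating; a sketch (not used for the charts below), which also covers keywords missing from some column:
query = """
select keyword, count(*) c from (
    select keyword0 keyword from weibo
    union all select keyword1 from weibo
    union all select keyword2 from weibo
    union all select keyword3 from weibo
) stacked
where keyword != '##'
group by keyword
order by c desc limit 10;
"""
result = %sql $query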
query = """
select k0.keyword, k0.c+k1.c+k2.c+k3.c as c from
(select keyword0 keyword,count(*) c from weibo group by keyword0) k0,
(select keyword1 keyword,count(*) c from weibo group by keyword1) k1,
(select keyword2 keyword,count(*) c from weibo group by keyword2) k2,
(select keyword3 keyword,count(*) c from weibo group by keyword3) k3
where k0.keyword = k1.keyword and k2.keyword = k1.keyword and k2.keyword = k3.keyword and k1.keyword != '##'
order by c desc limit 15;
"""
result = %sql $query
result.bar()
For all topic keywords, take the top 100 by frequency and generate a word cloud:
query = """
select k0.keyword, k0.c+k1.c+k2.c+k3.c as c from
(select keyword0 keyword,count(*) c from weibo group by keyword0) k0,
(select keyword1 keyword,count(*) c from weibo group by keyword1) k1,
(select keyword2 keyword,count(*) c from weibo group by keyword2) k2,
(select keyword3 keyword,count(*) c from weibo group by keyword3) k3
where k0.keyword = k1.keyword and k2.keyword = k1.keyword and k2.keyword = k3.keyword and k1.keyword != '##'
order by c desc limit 100;
"""
result = %sql $query
from pyecharts import options as opts
from pyecharts.charts import WordCloud
from pyecharts.globals import SymbolType
from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_NOTEBOOK
words = list(result)
c = (
WordCloud()
.add("", words, word_size_range=[20, 100], shape=SymbolType.ROUND_RECT)
.set_global_opts(title_opts=opts.TitleOpts(title='全国新型冠状病毒疫情词云图'))
)
c.render_notebook()
Analyze the post text itself: segment it with jieba and count every word's frequency.
import jieba
from collections import Counter
total_data = %sql select content from weibo;
cut_words = ""
all_words = ""
i = 0
for line in total_data:
    line = str(line[0])                 # each row is a 1-tuple; take the content column
    i = i + 1
    seg_list = jieba.cut(line, cut_all=False)
    cut_words = " ".join(seg_list)
    all_words += cut_words + " "        # trailing space keeps adjacent posts from fusing into one token
    if i % 10000 == 0:
        print(str(i) + ' ', end='')     # progress indicator
all_words = all_words.split()
c = Counter()
for x in all_words:
    if len(x) > 1 and x != '\r\n':      # skip single characters and stray line breaks
        c[x] += 1
words = []
for (k, v) in c.most_common(5000):
    words.append((k, v))
words = words[1:]                       # drop the most frequent entry
import pandas as pd
# Persist the top-5000 list so the file read back below actually exists
# (column names are an assumption; adjust if the CSV was produced elsewhere).
pd.DataFrame(words, columns=['词语', '词频']).to_csv('词频前5000.csv', index=False)
words = pd.read_csv('词频前5000.csv')
all_word = " ".join(all_words)
print(words)
import jieba.analyse
# TF-IDF keyword extraction
keywords = jieba.analyse.extract_tags(all_word,
                                      topK=50,
                                      withWeight=True,
                                      allowPOS=('a', 'e', 'n', 'nr', 'ns', 'v'))  # POS: adjectives, interjections, nouns (incl. person/place names), verbs
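# The literal list below is a cached copy of an earlier extract_tags run, pasted in
# so the plotting cells can be re-run without redoing the slow TF-IDF extraction;
# it overwrites the result computed in the previous cell.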
keywords = [('疫情', 0.11912294859704146), ('肺炎', 0.11055183921684136), ('全文', 0.09896922233781136), ('武汉', 0.08653435069252709), ('展开', 0.07282289241826259), ('加油', 0.060396253662755174), ('口罩', 0.057737899296916), ('确诊', 0.047917340409658435), ('新冠', 0.045060153220399156), ('病例', 0.043112362196566556), ('感染', 0.04278686859643067), ('病毒', 0.04090026875872963), ('医院', 0.03522556557504731), ('患者', 0.033405320917091134), ('视频', 0.03276271976484176), ('网页', 0.032327948806561826), ('链接', 0.031009521739956396), ('致敬', 0.025251712758224455), ('隔离', 0.02448653098084746), ('医护人员', 0.022201996864772915), ('湖北', 0.02183352867944201), ('医生', 0.020275206438533633), ('打卡', 0.01906030403713307), ('卫健委', 0.01811146889874087), ('出院', 0.017497216222247255), ('中国', 0.01702356334989813), ('大家', 0.016448027627049253), ('治愈', 0.016390183033353274), ('没有', 0.015687250330511104), ('希望', 0.015585062800449967), ('医用', 0.01423829082197198), ('发热', 0.013859673942509093), ('钟南山', 0.013779294200016197), ('转发', 0.013740735889898976), ('健康', 0.013190900738973856), ('治疗', 0.0128036104960136), ('肖战', 0.012562349511614255), ('新闻', 0.012479091615982256), ('防护', 0.012414449293612637), ('新增', 0.012277666352539497), ('酒精', 0.012173722385749347), ('物资', 0.01212093900823893), ('复工', 0.011909280661426484), ('朱一龙', 0.01159714066113227), ('全国', 0.011574160533591114), ('社区', 0.011327061602028836), ('症状', 0.011320896900746403), ('医疗队', 0.01117094804171221), ('上班', 0.011100770212804955), ('平安', 0.010739102817894648)]
import pandas as pd
import matplotlib.pyplot as plt
ss = pd.DataFrame(keywords,columns = ['词语','重要性'])
# print(ss)
import numpy as np
from matplotlib.font_manager import FontProperties
plt.figure(figsize=(10, 6))
plt.title('TF-IDF Ranking')
ax = plt.axes()
ax.barh(range(len(ss.重要性[:25][::-1])), ss.重要性[:25][::-1])
ax.set_yticks(np.arange(len(ss.重要性[:25][::-1])))
font = FontProperties(fname=r'c:\windows\fonts\simsun.ttc')
ax.set_yticklabels(ss.词语[:25][::-1], fontproperties=font)
ax.set_xlabel('Importance')
Query the top topic keywords for each day (top 100 per day here, to feed the daily word clouds):
results = []
# 50 days of data: Jan 1-31 and Feb 1-19
for m, last_day in ((1, 31), (2, 19)):
    for d in range(1, last_day + 1):
        query = """
        with weibo_d as(
        select * from weibo where day = %d and month = %d
        )
        select k0.keyword, k0.c+k1.c+k2.c+k3.c as c from
        (select keyword0 keyword,count(*) c from weibo_d group by keyword0) k0,
        (select keyword1 keyword,count(*) c from weibo_d group by keyword1) k1,
        (select keyword2 keyword,count(*) c from weibo_d group by keyword2) k2,
        (select keyword3 keyword,count(*) c from weibo_d group by keyword3) k3
        where k0.keyword = k1.keyword and k2.keyword = k1.keyword and k2.keyword = k3.keyword and k1.keyword != '##'
        order by c desc limit 100;
        """ % (d, m)
        result = %sql $query
        results.append(result)
Generate a word cloud for each day (these could be shown on a dashboard; here we render only the first day):
from pyecharts import options as opts
from pyecharts.charts import WordCloud
from pyecharts.globals import SymbolType
from pyecharts.globals import CurrentConfig, NotebookType
CurrentConfig.NOTEBOOK_TYPE = NotebookType.JUPYTER_NOTEBOOK
words = list(results[0])
c = (
WordCloud()
.add("", words, word_size_range=[20, 100], shape=SymbolType.ROUND_RECT)
.set_global_opts(title_opts=opts.TitleOpts(title='全国新型冠状病毒疫情词云图'))
)
c.render_notebook()
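To browse every day instead of only the first, pyecharts' Timeline can wrap one word cloud per day; a minimal sketch, assuming results holds one result set per day in date order:
from pyecharts import options as opts
from pyecharts.charts import Timeline, WordCloud

tl = Timeline()
for i, day_result in enumerate(results):
    wc = (
        WordCloud()
        .add("", list(day_result), word_size_range=[20, 100])
        .set_global_opts(title_opts=opts.TitleOpts(title='每日疫情词云图'))
    )
    tl.add(wc, 'day %d' % (i + 1))   # label each frame with its index in the date range
tl.render_notebook()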
query = """
select month||'-'||day as time,count(*) from weibo where keyword0 = '疫情' or keyword1 = '疫情' or keyword2 = '疫情' or keyword3 = '疫情' group by month,day;
"""
result = %sql $query
result.bar()
A word like '疫情' (epidemic) carries an official tone; it began to be mentioned around Jan 20, once officials acknowledged human-to-human transmission of the virus in Wuhan, and quickly became a high-frequency word.
query = """
select month||'-'||day as time,count(*) from weibo where keyword0 = '武汉' or keyword1 = '武汉' or keyword2 = '武汉' or keyword3 = '武汉' group by month,day order by month,day;
"""
result = %sql $query
result.bar()
'武汉' (Wuhan) was mentioned only occasionally at first, peaked when the lockdown began, and then declined slowly as the epidemic spread nationwide.
Track how the overall average sentiment changes by date:
x = %sql select month, day, avg(sentiments) from weibo group by day,month order by month,day;
%matplotlib inline
x.bar()
Overall, epidemic-related posts lean positive or neutral. Before Jan 19 the average sentiment was relatively high, around 0.2; on Jan 19, as the epidemic began spreading nationwide and Zhong Nanshan confirmed human-to-human transmission, the average dropped to around 0.1; on Feb 18-19, possibly because the domestic epidemic was approaching its inflection point, sentiment rose noticeably again.
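The trend is easier to read as a line chart; a sketch converting the result set to a DataFrame (ipython-sql's ResultSet.DataFrame()) and plotting with matplotlib (Postgres names the aggregate column 'avg'):
import matplotlib.pyplot as plt

df = x.DataFrame()   # columns: month, day, avg
df['date'] = df['month'].astype(str) + '-' + df['day'].astype(str)
plt.figure(figsize=(12, 4))
plt.plot(df['date'], df['avg'])
plt.xticks(rotation=90, fontsize=7)
plt.ylabel('average sentiment')
plt.show()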
Count how many posts fall into each 0.1-wide sentiment bucket:
x = %sql select round(sentiments::numeric,1),count(*) from weibo group by round(sentiments::numeric,1) order by round(sentiments::numeric,1);
%matplotlib inline
x.bar()
Most sentiment values cluster around 0.5 and -0.5; the distribution bulges at both ends and dips in the middle, indicating polarized opinion.
For the 15 most frequent topic keywords, look at the average sentiment of the posts that mention them:
query = """
select key, avg(sentiments) from
(select k0.keyword as key, k0.c+k1.c+k2.c+k3.c as c from
(select keyword0 keyword,count(*) c from weibo group by keyword0) k0,
(select keyword1 keyword,count(*) c from weibo group by keyword1) k1,
(select keyword2 keyword,count(*) c from weibo group by keyword2) k2,
(select keyword3 keyword,count(*) c from weibo group by keyword3) k3
where k0.keyword = k1.keyword and k2.keyword = k1.keyword and k2.keyword = k3.keyword and k1.keyword != '##'
order by c desc limit 15) top, weibo
where weibo.keyword0 = key or weibo.keyword1 = key or weibo.keyword2 = key or weibo.keyword3 = key
group by key,c order by c desc;
"""
result = %sql $query
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei']   # render Chinese labels
plt.rcParams['axes.unicode_minus'] = False
result.bar()
query = """
select month||'-'||day as time,avg(sentiments) from weibo where keyword0 = '肺炎' or keyword1 = '肺炎' or keyword2 = '肺炎' or keyword3 = '肺炎' group by month,day;
"""
result = %sql $query
result.bar()
Look at how often posts mention Dr. Li Wenliang ('李文亮'):
query = """
select month||'-'||day as time,count(*) from weibo where keyword0 = '李文亮' or keyword1 = '李文亮' or keyword2 = '李文亮' or keyword3 = '李文亮' group by month,day order by month,day;
"""
result = %sql $query
result.bar()
Average sentiment of those posts, by date:
query = """
select month||'-'||day as time,avg(sentiments) from weibo where keyword0 = '李文亮' or keyword1 = '李文亮' or keyword2 = '李文亮' or keyword3 = '李文亮' group by month,day order by month,day;
"""
result = %sql $query
result.bar()
Look at how often posts mention Shuanghuanglian ('双黄连'):
query = """
select month||'-'||day as time,count(*) from weibo where keyword0 = '双黄连' or keyword1 = '双黄连' or keyword2 = '双黄连' or keyword3 = '双黄连' group by month,day order by month,day;
"""
result = %sql $query
result.bar()
As the Shuanghuanglian rumor spread, the number of related posts spiked to a peak and then declined gradually.
query = """
select month||'-'||day as time,avg(sentiments) from weibo where keyword0 = '双黄连' or keyword1 = '双黄连' or keyword2 = '双黄连' or keyword3 = '双黄连' group by month,day having month = 2 or (month = 1 and day >28) order by month,day;
"""
%matplotlib inline
result = %sql $query
result.bar()