
1. Fixing garbled Chinese in wordcloud:

w = wordcloud.WordCloud(width=1000,
    font_path="/System/Library/Fonts/PingFang.ttc",
    height=700)     # the font path must point to a CJK-capable font; search your system for a .ttc file and copy its path here
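
For context, a minimal end-to-end sketch of the fix (assuming the wordcloud package is installed and PingFang.ttc exists at this macOS path; substitute any installed CJK .ttc/.ttf on other systems):

import wordcloud

# with font_path set to a CJK font, Chinese words render correctly instead of as boxes
text = "数据 分析 可视化 词云 数据 分析"
w = wordcloud.WordCloud(width=1000, height=700,
                        font_path="/System/Library/Fonts/PingFang.ttc")
w.generate(text)
w.to_file("wordcloud_test.png")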

2. Fixing garbled Chinese in matplotlib:

plt.rcParams['font.sans-serif'] = ['Arial Unicode MS'] 
matplotlib.rcParams['font.sans-serif'] = ['Arial Unicode MS']
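
A quick way to verify the matplotlib fix (a minimal sketch; it assumes 'Arial Unicode MS' is actually installed, which is typical on macOS — on other systems substitute an installed CJK font such as 'SimHei'):

import matplotlib.pyplot as plt

plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']  # CJK-capable font
plt.rcParams['axes.unicode_minus'] = False  # render the minus sign correctly with a non-default font
plt.plot([1, 2, 3], [-1, 0, 1])
plt.title('中文标题测试')  # should display without boxes
plt.show()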

3. Complete visualization code:

import matplotlib.pyplot as plt
import matplotlib
import jieba
import jieba.analyse
import xlwt
import xlrd
from wordcloud import WordCloud
import numpy as np
from collections import Counter
# set the font; some Linux systems have font issues
matplotlib.rcParams['font.sans-serif'] = ['Arial Unicode MS']
matplotlib.rcParams['axes.unicode_minus'] = False


# `comment` is a list of comment records, e.g. [ ['1','name','star rating','upvotes','comment text'], ['2','name','star rating','upvotes','comment text'] ]
def anylasescore(comment):
    score = [0, 0, 0, 0, 0, 0]  # counts for ratings 0, 1, 2, 3, 4, 5
    count = 0  # total number of ratings
    for va in comment:  # iterate over each record ['1','name','star rating','upvotes','comment text']
        try:
            score[int(va[2])] += 1  # column 3 (index 2) is the star rating; cast it to int
            count += 1
        except Exception as e:
            continue
    print(score)
    label = '1分', '2分', '3分', '4分', '5分'
    color = 'blue', 'orange', 'yellow', 'green', 'red'  # colors for each category
    size = [0, 0, 0, 0, 0]  # percentage for each rating; together they sum to 100
    explode = [0, 0, 0, 0, 0]  # explode: how far each wedge is offset from the center
    for i in range(1, 6):  # compute the share and offset for each rating from 1 to 5
        size[i - 1] = score[i] * 100 / count
        explode[i - 1] = score[i] / count / 10
    pie = plt.pie(size, colors=color, explode=explode, labels=label, shadow=True, autopct='%1.1f%%')
    for font in pie[1]:
        font.set_size(8)
    for digit in pie[2]:
        digit.set_size(8)
    plt.axis('equal')  # keep the pie chart circular
    plt.title(u'各个评分占比', fontsize=12)  # title
    plt.legend(loc=0, bbox_to_anchor=(0.82, 1))  # legend
    # set the legend font size
    leg = plt.gca().get_legend()
    ltext = leg.get_texts()
    plt.setp(ltext, fontsize=6)
    plt.savefig("score.png")
    # show the figure
    plt.show()


def getzhifang(counter):  # bar chart; needs x values (words) and y values (counts)
    x = []
    y = []
    for k, v in counter.most_common(15):  # take the 15 most frequent words
        x.append(k)
        y.append(v)
    Xi = np.array(x)  # convert to numpy arrays
    Yi = np.array(y)

    width = 0.6
    plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']  # render Chinese labels correctly
    plt.figure(figsize=(8, 6))  # figure size 8x6
    plt.bar(Xi, Yi, width, color='blue', label='热门词频统计', alpha=0.8, )

    plt.xlabel("词频")
    plt.ylabel("次数")
    plt.savefig('zhifang.png')
    plt.show()
    return


def getciyun_most(counter):  # generate word clouds
    # one list for the words, one for their counts
    x = []
    y = []
    for k, v in counter.most_common(300):  # take the 300 most frequent words
        x.append(k)
        y.append(v)
    xi = x[0:150]  # take the first 150 words
    xi = ' '.join(xi)  # join them with spaces (the format WordCloud.generate expects)
    print(xi)
    # backgroud_Image = plt.imread('')  # for a custom-shaped (masked) word cloud
    # basic word-cloud settings: size, font, etc.
    wc = WordCloud(background_color="white",
                   width=1500, height=1200,
                   # min_font_size=40,
                   # mask=backgroud_Image,
                   font_path="/System/Library/Fonts/PingFang.ttc",
                   max_font_size=150,  # maximum font size
                   random_state=50,  # number of random states, i.e. how many color schemes
                   )  # pitfall: font_path must point to a CJK font, otherwise the words render as little boxes
    # wc.font_path="simhei.ttf"  # alternative: SimHei
    my_wordcloud = wc.generate(xi)  # feed the first 150 words into the word cloud
    plt.imshow(my_wordcloud)  # display
    my_wordcloud.to_file("img.jpg")  # save
    xi = ' '.join(x[150:300])  # build a second word cloud from the next 150 words and save it too
    my_wordcloud = wc.generate(xi)
    my_wordcloud.to_file("img2.jpg")

    plt.axis("off")


def anylaseword(comment):
    # stop words: some words carry no meaning and need to be filtered out
    stopwords = ['这个', '一个', '不少', '起来', '没有', '就是', '不是', '那个', '还是', '剧情', '这样', '那样', '这种', '那种', '故事', '人物', '什么']
    print(stopwords)
    commnetstr = ''  # concatenated comment text
    c = Counter()  # Counter to store the word frequencies
    index = 0
    for va in comment:
        seg_list = jieba.cut(va[4], cut_all=False)  # jieba word segmentation
        index += 1
        for x in seg_list:
            if len(x) > 1 and x != '\r\n':  # skip single characters and line breaks
                try:
                    c[x] += 1  # increment this word's count
                except:
                    continue
        commnetstr += va[4]
    for (k, v) in c.most_common():  # drop words that occur fewer than 5 times or are stop words
        if v < 5 or k in stopwords:
            c.pop(k)
            continue
        # print(k,v)
    print(len(c), c)
    getzhifang(c)  # draw the bar chart from this data
    getciyun_most(c)  # word clouds
    # print(commnetstr)


def anylase():
    data = xlrd.open_workbook('comments.xls')  # open the xls file
    table = data.sheets()[0]  # open the first sheet
    nrows = table.nrows  # number of rows in the sheet
    comment = []

    for i in range(nrows):
        comment.append(table.row_values(i))  # append each row's values to the list
    # print(comment)
    anylasescore(comment)
    anylaseword(comment)


if __name__ == '__main__':
    anylase()

4. Complete crawler code:

import requests
from bs4 import BeautifulSoup
import urllib.parse

import xlwt
import xlrd

# log in with username and password
def login(username, password):
    url = 'https://accounts.douban.com/j/mobile/login/basic'
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        'Referer': 'https://accounts.douban.com/passport/login_popup?login_source=anony',
        'Origin': 'https://accounts.douban.com',
        'content-Type': 'application/x-www-form-urlencoded',
        'x-requested-with': 'XMLHttpRequest',
        'accept': 'application/json',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'connection': 'keep-alive'
        , 'Host': 'accounts.douban.com'
    }
    # parameters to send with the login request
    data = {
        'ck' : '',
        'name': '',
        'password': '',
        'remember': 'false',
        'ticket': ''
    }
    data['name'] = username
    data['password'] = password
    data = urllib.parse.urlencode(data)
    print(data)
    req = requests.post(url, headers=header, data=data, verify=False)
    cookies = requests.utils.dict_from_cookiejar(req.cookies)
    print(cookies)
    return cookies

def getcomment(cookies, mvid):  # args: cookies from a successful login (the backend identifies the user by them) and the movie id
    start = 0
    w = xlwt.Workbook(encoding='ascii')  # create a writable workbook object
    ws = w.add_sheet('sheet1')  # create worksheet sheet1
    index = 1  # row number to write to in the xls file
    while True:
        # send requests with a browser-like User-Agent header
        header = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        }
        # try/except: once a request or parse fails we assume we've reached the end and stop; otherwise keep going
        try:
            # build the URL; start increases by 20 for each page
            url = 'https://movie.douban.com/subject/' + str(mvid) + '/comments?start=' + str(
                start) + '&limit=20&sort=new_score&status=P&comments_only=1'
            start += 20
            # send the request
            req = requests.get(url, cookies=cookies, headers=header)
            # the response is a JSON string; parse it with req.json()
            res = req.json()
            res = res['html']  # the data we need is under the `html` key
            soup = BeautifulSoup(res, 'lxml')  # build a BeautifulSoup object from the HTML fragment to extract information
            node = soup.select('.comment-item')  # each comment has class `comment-item`; every page contains 20 of them
            if not node:  # no more comments: stop paging
                break
            for va in node:  # iterate over the comments
                name = va.a.get('title')  # reviewer's name
                star = va.select_one('.comment-info').select('span')[1].get('class')[0][-2]  # star rating (digit taken from the class name, e.g. 'allstar40' -> '4')
                votes = va.select_one('.votes').text  # number of upvotes
                comment = va.select_one('.short').text  # comment text
                print(name, star, votes, comment)
                ws.write(index, 0, index)  # row `index`, column 0: running index
                ws.write(index, 1, name)  # row `index`, column 1: reviewer
                ws.write(index, 2, star)  # row `index`, column 2: star rating
                ws.write(index, 3, votes)  # row `index`, column 3: upvotes
                ws.write(index, 4, comment)  # row `index`, column 4: comment text
                index += 1
        except Exception as e:  # exit on any exception
            print(e)
            break
    w.save('test.xls')  # save as test.xls


if __name__ == '__main__':
    username = input('输入账号:')
    password = input('输入密码:')
    cookies = login(username, password)
    mvid = input('电影的id为:')
    getcomment(cookies, mvid)

5. References:
https://juejin.im/post/6886997083052572686
https://blog.csdn.net/qq_32590631/article/details/80509741
https://blog.csdn.net/wlher/article/details/98186741

import csv
import os
import shutil
from chardet.universaldetector import UniversalDetector

def get_encode_info(file):
    with open(file, 'rb') as f:
        detector = UniversalDetector()
        for line in f.readlines():
            detector.feed(line)
            if detector.done:
                break
        detector.close()
        return detector.result['encoding']

def read_file(file):
    with open(file, 'rb') as f:
        return f.read()

def write_file(content, file):
    with open(file, 'wb') as f:
        f.write(content)

def convert_encode2utf8(file, original_encode, des_encode):
    file_content = read_file(file)
    file_decode = file_content.decode(original_encode,'ignore')
    file_encode = file_decode.encode(des_encode)
    write_file(file_encode, file)

## Move *.txt to a folder
def move2txtfolder(path, txt_file_list):
    txt_folder_path = path + '\\txt'
    if not os.path.exists(txt_folder_path):
        os.makedirs(txt_folder_path)

    for file in txt_file_list:
        des_path = os.path.join(txt_folder_path, os.path.basename(file))
        shutil.move(file, des_path)

## recursively find all *.txt files under the path
def findtxt(path, txt_file_list):
    file_name_list = os.listdir(path)
    for filename in file_name_list:
        de_path = os.path.join(path, filename)
        if os.path.isfile(de_path):
            if de_path.endswith(".txt"):  # Specify to find the txt file.
                txt_file_list.append(de_path)
        else:
            findtxt(de_path, txt_file_list)

def txt2csv(txt_file):
    ## first normalize the file's encoding to utf-8
    encode_info = get_encode_info(txt_file)
    if encode_info != 'utf-8':
        convert_encode2utf8(txt_file, encode_info, 'utf-8')

    csv_file = os.path.splitext(txt_file)[0] + '.csv'
    with open(csv_file, 'w+', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, dialect='excel')

        with open(txt_file, 'r', encoding='utf-8') as txtfile:
            for line in txtfile.readlines():
                line_list = line.strip('\n').split(';')
                writer.writerow(line_list)

if __name__ == '__main__':
    folder_path = r'C:\Details'
    # ## if the folder contains subfolders, use the findtxt function instead:
    # txt_file_list = []
    # findtxt(folder_path, txt_file_list)

    ## if there are no subfolders, build the list of txt files directly with a comprehension
    txt_file_list = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.txt')]

    for txt_file in txt_file_list:
        txt2csv(txt_file)
    
    move2txtfolder(folder_path, txt_file_list)

1. Read all the txt files in the folder and store their paths in a list;
2. For each txt file, automatically generate a csv file with the same base name;
3. Convert each txt file to csv by splitting on the delimiter, a semicolon ";"; before converting, normalize the file encoding to 'utf-8', because encoding errors kept showing up during implementation;
4. Create a txt subfolder and move all the txt files into it.

References:
https://www.cnblogs.com/danvy/p/11667763.html

The current mainstream approaches and tools for word segmentation and part-of-speech tagging:
1. For ordinary Chinese tasks a segmentation tool is enough: jieba, HanLP, Ansj, Stanford NLP (a short jieba sketch follows below);
2. With your own corpus: HMM or CRF; for neural-network training the usual choices are the CRF++ toolkit (which lets you build features), CNN-LSTM (simpler), BiLSTM+CRF (the most widely used), or BERT+BiLSTM+CRF (newer).
I tried Stanford NLP and spaCy myself; both work.
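
Since item 1 above points to jieba for ordinary Chinese tasks, here is a minimal jieba POS-tagging sketch (assuming the jieba package is installed); the spaCy and Stanford CoreNLP snippets below cover the English case:

import jieba.posseg as pseg

# segment a Chinese sentence and print each word with its part-of-speech tag (n = noun, v = verb, ...)
for word, flag in pseg.cut("我们尝试描述图像边缘的几何结构"):
    print(word, flag)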

import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp(u'This is a sentence .')
for token in doc:
    print(token, token.pos_, token.pos)
for nounc in doc.noun_chunks:
    print(nounc)
#with open('demo.txt', mode='r', encoding='UTF-8') as f: 
def getFileContent(path):
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()
demo = nlp(getFileContent('./demo.txt'))
for nounc in demo.noun_chunks:
    print(nounc)

or

file = open("demo.txt", "r")
doc = nlp(file.read())
for np in doc.noun_chunks:
    print(np.text)

or

import spacy

# note: span.merge() below only works in spaCy 2.x (it was removed in 3.x in favor of Doc.retokenize)
nlp = spacy.load("en_core_web_sm")  # the bare "en" shortcut is deprecated in recent spaCy versions

doc = nlp("We try to explicitly describe the geometry of the edges of the images.")

for np in doc.noun_chunks: # use np instead of np.text
    print(np)

print()

# code to recursively combine nouns
# 'We' is actually a pronoun but included in your question
# hence the token.pos_ == "PRON" part in the last if statement
# suggest you extract PRON separately like the noun-chunks above

index = 0
nounIndices = []
for token in doc:
    # print(token.text, token.pos_, token.dep_, token.head.text)
    if token.pos_ == 'NOUN':
        nounIndices.append(index)
    index = index + 1


print(nounIndices)
for idxValue in nounIndices:
    doc = nlp("We try to explicitly describe the geometry of the edges of the images.")
    span = doc[doc[idxValue].left_edge.i : doc[idxValue].right_edge.i+1]
    span.merge()

    for token in doc:
        if token.dep_ == 'dobj' or token.dep_ == 'pobj' or token.pos_ == "PRON":
            print(token.text)

or

from stanfordcorenlp import StanfordCoreNLP
import nltk
from nltk.tree import Tree as nltkTree

nlp = StanfordCoreNLP('./StanfordNLP工具包/stanford-corenlp-full-2018-02-27') 

sentence = 'person removes plate out of cabinet'  # input sentence

sen_tag = nlp.pos_tag(sentence)  # part-of-speech tagging
print(sen_tag)
noun_word = []
for i in range(len(sen_tag)):
    if sen_tag[i][1] == 'NN':
        noun_word.append(sen_tag[i][0])
print(noun_word)

References:
https://blog.csdn.net/lsp1991/article/details/22733619
https://my.oschina.net/u/4232146/blog/4626208
http://www.manongjc.com/article/40737.html
https://spacy.io/models
https://zhuanlan.zhihu.com/p/51425975
https://www.it1352.com/1683784.html
http://www.mamicode.com/info-detail-2686258.html
http://stanfordnlp.github.io/CoreNLP/
https://github.com/stanfordnlp/CoreNLP
https://www.zhihu.com/question/29940957
https://www.jianshu.com/p/931c49830754
https://www.cnblogs.com/maoerbao/p/13019276.html
https://www.it610.com/article/1188489230905614336.htm
https://zhuanlan.zhihu.com/p/137226095
https://blog.csdn.net/weixin_34613450/article/details/84317158
https://zhuanlan.zhihu.com/p/44180488

At first the plan was to do the conversion in Python with langconv.py. Note that the files must be saved as UTF-8 (set the encoding in Notepad++) and read as UTF-8, but this was slow and reading into a list produced errors.
I then switched to OpenCC. Loading it from Python first was also too slow for these lists, so I used the win-x64 build instead and ran it as a batch job via a .bat file.
The .bat lives under \OpenCC\build\bin\, and each line has the form: opencc -i <input file> -o <output file> -c <path to the json config>.
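
For reference, the Python-based OpenCC attempt mentioned above looks roughly like this (a minimal sketch assuming the opencc-python-reimplemented package; as noted, it was too slow for these files, which is why the .bat commands below were used instead):

from opencc import OpenCC

cc = OpenCC('t2s')  # Traditional -> Simplified, same as the t2s.json config used below

# convert one file line by line; file names taken from the first .bat command below
with open('03dianxi.txt', 'r', encoding='utf-8') as fin, \
        open('003dianxi.txt', 'w', encoding='utf-8') as fout:
    for line in fin:
        fout.write(cc.convert(line))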

opencc -i 03dianxi.txt -o 003dianxi.txt -c C:\Users\Administrator\Desktop\fjt\OpenCC\build\share\opencc\t2s.json
opencc -i 06chaoxiaochu.txt -o 006chaoxiaochu.txt -c C:\Users\Administrator\Desktop\fjt\OpenCC\build\share\opencc\t2s.json
opencc -i 07xiaorong.txt -o 007xiaorong.txt -c C:\Users\Administrator\Desktop\fjt\OpenCC\build\share\opencc\t2s.json
opencc -i 08ermi.txt -o 008ermi.txt -c C:\Users\Administrator\Desktop\fjt\OpenCC\build\share\opencc\t2s.json
opencc -i 09shanbei.txt -o 009shanbei.txt -c C:\Users\Administrator\Desktop\fjt\OpenCC\build\share\opencc\t2s.json
opencc -i 10linmeimei.txt -o 010linmeimei.txt -c C:\Users\Administrator\Desktop\fjt\OpenCC\build\share\opencc\t2s.json
opencc -i 11yeshi.txt -o 011yeshi.txt -c C:\Users\Administrator\Desktop\fjt\OpenCC\build\share\opencc\t2s.json
opencc -i 12martin.txt -o 012martin.txt -c C:\Users\Administrator\Desktop\fjt\OpenCC\build\share\opencc\t2s.json
opencc -i 13m.txt -o 013m.txt -c C:\Users\Administrator\Desktop\fjt\OpenCC\build\share\opencc\t2s.json
opencc -i 14wuage.txt -o 014wuage.txt -c C:\Users\Administrator\Desktop\fjt\OpenCC\build\share\opencc\t2s.json
opencc -i 15meiweixiaoshe.txt -o 015meiweixiaoshe.txt -c C:\Users\Administrator\Desktop\fjt\OpenCC\build\share\opencc\t2s.json
opencc -i 16dashidecai.txt -o 016dashidecai.txt -c C:\Users\Administrator\Desktop\fjt\OpenCC\build\share\opencc\t2s.json
opencc -i 17xiaoyingmeishi.txt -o 017xiaoyingmeishi.txt -c C:\Users\Administrator\Desktop\fjt\OpenCC\build\share\opencc\t2s.json
opencc -i 18shanyaocun.txt -o 018shanyancun.txt -c C:\Users\Administrator\Desktop\fjt\OpenCC\build\share\opencc\t2s.json

References:
https://blog.csdn.net/lotusws/article/details/82934599
https://github.com/skydark/nstools
https://www.jb51.net/article/152940.htm
https://blog.csdn.net/wds2006sdo/article/details/53583367
https://blog.csdn.net/dongfuguo/article/details/89709626
https://www.cnblogs.com/qingchengzi/articles/10602109.html
https://zhuanlan.zhihu.com/p/55973055
https://blog.csdn.net/sinat_29957455/article/details/81290356
https://github.com/BYVoid/OpenCC
https://www.pianshen.com/article/446283828/
https://github.com/Lchiffon/ropencc

Ran into the "No input file specified" problem; the troubleshooting went as follows:
1. First checked the PHP configuration and adjusted its mbstring settings: no effect; mbstring is enabled by default in the BT panel.
2. Tried turning off the BT panel's cross-site protection (e.g. deleting .user.ini and .htaccess): the effect was inconclusive.
3. Then checked the NGINX configuration and commented out the following line in /www/server/nginx/conf/fastcgi.conf:

#fastcgi_param  PHP_ADMIN_VALUE    "$bt_safe_dir=$bt_safe_open";

After applying the change in step 3 and restarting NGINX, the "No input file specified" error no longer appeared.