A Few Small Problems


1. How to set up a Python file template in PyCharm
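For reference, PyCharm's file templates live under Settings > Editor > File and Code Templates > Python Script. A minimal template sketch using the built-in Velocity variables (names such as ${USER}, ${NAME}, ${DATE} and ${TIME} should be checked against your PyCharm version) could mirror the headers used in the scripts later in this post:

#!/usr/bin/env python
# encoding: utf-8
"""
@version: python2.7
@author: ${USER}
@file: ${NAME}
@time: ${DATE} ${TIME}
"""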


2. PyCharm reports an error on Chinese comments: SyntaxError: Non-ASCII character
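The usual cause is a Chinese comment in a Python 2 source file that has no encoding declaration. Per PEP 263, put a coding line in the first or second line of the file, for example:

#!/usr/bin/env python
# -*- coding: utf-8 -*-

# 中文注释 now parses without "SyntaxError: Non-ASCII character"
print("ok")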


3. Garbled text in the Navicat UI after installing it on Ubuntu: how to fix it


4. Navicat on Ubuntu fails to import a CSV file: most likely the field delimiter was left at the default "Tab" ("定位" in the Chinese UI); change it to comma (or try the other options).


5. A guide to using Gensim Word2Vec
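As a minimal sketch of the older gensim API used later in item 12 (argument names such as size= were renamed in later gensim releases, so treat this as an assumption about the 2017-era version):

import gensim

sentences = [["hello", "there"], ["how", "are", "you"]]        # toy corpus: tokenized sentences
model = gensim.models.Word2Vec(sentences, size=50, min_count=0, workers=2)
model.save("./toy_model")
new_model = gensim.models.Word2Vec.load("./toy_model")
print(new_model.similarity("hello", "there"))                  # cosine similarity of two words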


6. An introduction to the NLP toolkit spaCy
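A minimal spaCy usage sketch in the spirit of the sentence splitter in item 11 (the name passed to spacy.load depends on which English model is installed or linked locally, so 'en' here is an assumption):

import spacy

nlp = spacy.load('en')                            # load an English pipeline
doc = nlp(u"This is a sentence. Here is another one.")
for sent in doc.sents:                            # sentence boundaries
    print(sent.text)
for token in doc:
    print(token.text, token.pos_)                 # token text and part-of-speech tag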


7. AttributeError: 'Word2Vec' object has no attribute 'syn0'
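This error shows up when code written for an old gensim release is run against gensim >= 1.0, where the vectors were moved into model.wv. A hedged sketch of the usual fix:

import gensim

model = gensim.models.Word2Vec.load("./mymodel")
# old API (pre-1.0):  vectors = model.syn0
vectors = model.wv.syn0        # the embedding matrix, one row per vocabulary word
vocab = model.wv.vocab         # word -> vocabulary entry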


8. An introduction to SQLAlchemy (how its data types map onto MySQL)
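A rough sketch of how common SQLAlchemy column types end up as MySQL types when the table is created (the table and column names here are made up for illustration):

from sqlalchemy import create_engine, Column, Integer, String, Text, Float
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class EssayDemo(Base):
    __tablename__ = 'essays_demo'
    id = Column(Integer, primary_key=True)    # -> INT
    text_name = Column(String(255))           # -> VARCHAR(255)
    line_text = Column(Text)                  # -> TEXT
    score = Column(Float)                     # -> FLOAT

engine = create_engine('mysql+pymysql://root:root@localhost:3306/personality_1', echo=True)
Base.metadata.create_all(engine)              # emits CREATE TABLE with the mapped MySQL types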


9. keras: texts_to_sequences_generator(texts)

from keras.preprocessing.text import Tokenizer

texts = data.x_train
sample_index = 0
text_list = texts[sample_index][0]   # a list of sentences, each one a unicode string
tokenizer = Tokenizer(word_num_per_sent)
tokenizer.fit_on_texts(text_list)

The error:

File "/home/sunxiangguo/PycharmProjects/personality/cnn.py", line 85, in <module> tokenizer.fit_on_texts(text_list) File "/home/sunxiangguo/anaconda2/lib/python2.7/site-packages/keras/preprocessing/text.py", line 119, in fit_on_texts self.split) File "/home/sunxiangguo/anaconda2/lib/python2.7/site-packages/keras/preprocessing/text.py", line 38, in text_to_word_sequence text = text.translate(maketrans(filters, split * len(filters))) TypeError: character mapping must return integer, None or unicode

The fix:

from keras.preprocessing.text import Tokenizer

texts = data.x_train
sample_index = 0
text_list = texts[sample_index][0]   # a list of sentences, each one a unicode string
tokenizer = Tokenizer(word_num_per_sent)
tokenizer.fit_on_texts([s.encode('ascii') for s in text_list])
# tokenizer.fit_on_texts(text_list)
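The traceback points at maketrans inside keras' text_to_word_sequence, which under Python 2 cannot build a translation table from unicode input, so encoding the sentences to plain byte strings sidesteps it. Once fitted, the tokenizer can turn the same sentences into index sequences (a small usage sketch reusing the names above):

sequences = tokenizer.texts_to_sequences([s.encode('ascii') for s in text_list])
print(tokenizer.word_index)    # word -> integer index learned by fit_on_texts
print(sequences)               # each sentence as a list of word indices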

10. Write a once-and-for-all data class that both reads from the database and splits the data

#!/usr/bin/env python
# encoding: utf-8
"""
@version: python2.7
@author: Xiangguo Sun
@contact: sunxiangguodut@qq.com
@site: http://blog.csdn.net/github_36326955
@software: PyCharm
@file: get_data
@time: 17-7-11 1:55 PM
"""
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
import json


class Data(object):
    def __init__(self, big_five='cEXT'):
        # never changes:
        self.engine = create_engine('mysql+pymysql://root:root@localhost:3306/personality_1', echo=True)
        self.sample_y5 = self._get_sample_y5()
        self.sample_x = self._get_sample_x()

        # user-adjustable
        self.big_five = big_five
        self.train_size = 0.9

        # variables refreshed automatically after a user adjustment
        self.sample_y = self._get_sample_y()  # changes only with big_five
        self.x_train = None   # changes with big_five and train_size
        self.x_test = None    # changes with big_five and train_size
        self.y_train = None   # changes with big_five and train_size
        self.y_test = None    # changes with big_five and train_size
        self.update_train_test()

    def details(self):
        return {"sample_x": self.sample_x.shape,
                "sample_y5": self.sample_y5.shape,
                "big_five": self.big_five,
                "train_size": self.train_size,
                "sample_y": self.sample_y.shape,
                "x_train": self.x_train.shape,
                "x_test": self.x_test.shape,
                "y_train": self.y_train.shape,
                "y_test": self.y_test.shape}

    def _get_sample_x(self):
        df_all = pd.read_sql_table('table_3', self.engine, columns=['line_text'])  # read essays
        all_text = df_all['line_text']
        sample_x = []
        for text in all_text:
            # get all line_text entries for one essay
            cut_sentence_list = json.loads(text)  # type: list (from json to list)
            sample_x.append(cut_sentence_list)
        return np.array(sample_x).reshape((-1, 1))  # shape (2467, 1)

    def _get_sample_y5(self):
        return pd.read_sql_table('essays', self.engine,
                                 columns=['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN'])  # read essays

    def _get_sample_y(self):
        return self.sample_y5[self.big_five].reshape((-1, 1))  # shape (2467, 1)

    def set_big_five(self, big_five):
        """
        :param big_five: one of 'cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN'
        """
        self.big_five = big_five
        self.sample_y = self._get_sample_y()
        self.update_train_test()

    def set_train_size(self, train_size):
        self.train_size = train_size
        self.update_train_test()

    def update_train_test(self):
        self.x_train, self.x_test, self.y_train, self.y_test = \
            train_test_split(self.sample_x, self.sample_y,
                             random_state=1, train_size=self.train_size)


if __name__ == '__main__':
    data = Data()
    print(data.details())
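A small usage sketch of the class above: pick a different Big Five trait, change the split, and read back the refreshed splits.

data = Data(big_five='cNEU')
data.set_train_size(0.8)
print(data.details())
x_tr, y_tr = data.x_train, data.y_train    # training split for the chosen trait
x_te, y_te = data.x_test, data.y_test      # held-out split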

11. Write a once-and-for-all sentence-splitting script

#!/usr/bin/python
# -*- coding:utf8 -*-
"""
@version: python2.7
@author: Xiangguo Sun
@contact: sunxiangguodut@qq.com
@site: http://blog.csdn.net/github_36326955
@software: PyCharm
@file: RCNN
@time: 17-7-13 11:46 AM
"""
from sqlalchemy import create_engine  # mysql orm interface, nicer than mysqldb
import pandas as pd
import spacy  # an NLP toolkit like NLTK, but more industrial
import json


def cut_sentences(df):
    all_text_name = df["#AUTHID"]  # pandas.Series: all text names (the "#AUTHID" column in essays)
    all_text = df["TEXT"]          # pandas.Series: all texts (the "TEXT" column in essays)
    all_number = all_text_name.index[-1]  # from 0 to len(all_text_name)-1
    for i in xrange(0, all_number + 1, 1):
        print("start to deal with text ", i, " ...")
        text = all_text[i]            # str: one text in all_text
        text_name = all_text_name[i]  # str: one text_name in all_text_name
        nlp = spacy.load('en_sm')
        test_doc = nlp(text.decode())
        cut_sentence = []
        for sent in test_doc.sents:   # get each sentence in the text
            cut_sentence.append(sent.text)
            """
            sent is a spacy.tokens.span.Span, not a string,
            so we use Span.text to get its unicode form
            """
        cut_sentence_json = json.dumps(cut_sentence)
        line_number = len(cut_sentence)
        input_data_dic = {'text_name': text_name,
                          'line_number': line_number,
                          'line_text': cut_sentence_json}
        input_data = pd.DataFrame(input_data_dic, index=[i],
                                  columns=['text_name', 'line_number', 'line_text'])
        input_data.to_sql('table_3', engine, if_exists='append', index=False, chunksize=100)
        """
        DataFrame.index is inserted into the table by default.
        We don't want that, so we set index=False (True by default).
        """
        print("text ", i, " finished")


if __name__ == '__main__':
    engine = create_engine('mysql+pymysql://root:root@localhost:3306/personality_1', echo=True)
    df = pd.read_sql_table('essays', engine, chunksize=5)  # read essays in chunks
    for df_iter in df:
        cut_sentences(df_iter)
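A quick, hedged check that the sentences actually landed in table_3 as JSON lists:

import json
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('mysql+pymysql://root:root@localhost:3306/personality_1', echo=True)
df = pd.read_sql_table('table_3', engine, columns=['text_name', 'line_number', 'line_text'])
first = json.loads(df['line_text'][0])     # back from JSON to a list of sentences
print(df['text_name'][0], df['line_number'][0], first[:2])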

12. Write a word2vec training module

#!/usr/bin/env python
# encoding: utf-8
"""
@version: python2.7
@author: Xiangguo Sun
@contact: sunxiangguodut@qq.com
@site: http://blog.csdn.net/github_36326955
@software: PyCharm
@file: word2vect
@time: 17-7-10 5:00 PM
"""
from sqlalchemy import create_engine  # mysql orm interface, nicer than mysqldb
import pandas as pd
import spacy  # an NLP toolkit like NLTK, but more industrial
import json
import gensim
import datetime, time

"""
We use gensim.models.Word2Vec(sentences, size=200, min_count=0, workers=4)
to train our word vector model without GPUs.
Parameters:
    min_count=0   prune the internal vocabulary
    size=200      number of units in the NN layer
    workers=4     degree of parallelism
    alpha=0.025
start_time = 2017-07-10 19:30
end_time   = 2017-07-11 05:32:13.441757
total_time = about 10 hours
Ubuntu 16.04 LTS 64bit, python2.7, PyCharm
memory: 7.7GB, Intel Core i7-4790 CPU @ 3.60GHz x 8
"""

start_time = time.strftime("%Y-%m-%d %H:%M:%S")
with open("./log.txt", 'a') as f:
    f.write(str(start_time).decode())


class MySentences(object):
    def __init__(self, df_generator):
        self.df_generator = df_generator

    def __iter__(self):
        all_text = self.df_generator['line_text']
        count = 0
        for text in all_text:
            count = count + 1
            now = datetime.datetime.now()
            # with open("./log.txt", 'a') as f:
            #     f.write((str(now) + "..." + str(count)).decode())
            print(str(now) + "..." + str(count))
            # get all line_text entries for one essay
            cut_sentence_list = json.loads(text)  # type: list (from json to list)
            # step 1: tokenize each sentence and drop punctuation
            # for example: "hello , there." -> ['hello', 'there']
            # so from
            #   sentence1: "hello , there."
            #   sentence2: "I'm fine, thanks"
            # we get: [['hello', 'there'], ["I'm", "fine", "thanks"]]
            nlp = spacy.load('en_sm')
            stop_word_pos = ["PUNCT", "SPACE", "DET", "ADP"]
            """
            "PUNCT": punctuation
            "SPACE": whitespace
            "DET":   determiners such as "the"
            "ADP":   adpositions (prepositions)
            """
            for sentence in cut_sentence_list:
                sent = []
                text_doc = nlp(sentence.decode())
                for token in text_doc:
                    if token.pos_ not in stop_word_pos:
                        sent.append(token.text)  # token.text is unicode
                yield sent


engine = create_engine('mysql+pymysql://root:root@localhost:3306/personality_1', echo=True)
df_all = pd.read_sql_table('table_3', engine)  # read the split sentences
sentences = MySentences(df_all)
model = gensim.models.Word2Vec(sentences, size=200, min_count=0, workers=4)
path = "./mymodel"
model.save(path)
# To load the model later:
# new_model = gensim.models.Word2Vec.load(path)
# print(new_model.similarity("now", "here"))
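To connect the trained vectors with the Keras Tokenizer from item 9, one common pattern is to build an embedding matrix indexed by tokenizer.word_index. A sketch, under the assumption that tokenizer is the fitted Tokenizer from item 9 and that this gensim version exposes model.wv:

import numpy as np
import gensim

model = gensim.models.Word2Vec.load("./mymodel")
embedding_dim = 200
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, embedding_dim))
for word, idx in tokenizer.word_index.items():
    if word in model.wv.vocab:                  # only words the model actually saw
        embedding_matrix[idx] = model.wv[word]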

13. Error when checking model target: expected activation_2 to have shape (None, 10) but got array with shape (3, 1)

import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation

X_train = np.array([[1, 2], [6, 5], [8, 2]])
y_train = np.array([2, 3, 7])
input_dim = X_train.shape[1]

model = Sequential()
model.add(Dense(output_dim=64, input_dim=input_dim))
model.add(Activation("relu"))
model.add(Dense(output_dim=10))
model.add(Activation("softmax"))
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
model.fit(X_train, y_train, nb_epoch=5, batch_size=32)

The fix:

Use sparse_categorical_crossentropy as the loss: the labels here are integer class indices of shape (n,), while categorical_crossentropy expects one-hot targets of shape (None, 10) to match the 10-unit softmax output.
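The same toy model with only the loss swapped; integer labels like y_train = np.array([2, 3, 7]) are then accepted directly:

model.compile(loss='sparse_categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
model.fit(X_train, y_train, nb_epoch=5, batch_size=32)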

14. How to run your Django project from the terminal: if the project was developed inside a virtual environment, you must activate that environment before running it in the terminal, as shown below:

sunxiangguo@sunxiangguo-ubuntu:~/personality_web/bin$ source ./activate
(personality_web) sunxiangguo@sunxiangguo-ubuntu:~/personality_web/bin$ cd /home/sunxiangguo/PycharmProjects/personality_web
(personality_web) sunxiangguo@sunxiangguo-ubuntu:~/PycharmProjects/personality_web$ python manage.py runserver
Performing system checks...

System check identified no issues (0 silenced).
July 25, 2017 - 11:43:50
Django version 1.9.13, using settings 'personality_web.settings'
Starting development server at http://127.0.0.1:8000/
Quit the server with CONTROL-C.
