今天写一个脚本,需要将多个文件中的内容汇总到一个 txt 文件中。由于这些文件共有三种不同的编码方式,读写时出现了错误,现将解决方法记录如下:
# -*- coding: utf-8 -*-
# Merge the subtitle text of every .srt file in one folder into a single text
# file.  This is a Python 2 script.  The input files do not share one
# encoding, so each file's encoding is guessed with chardet before the file
# is parsed with pandas.
import wave
import pylab as pl
import numpy as np
import pandas as pd
import os
import time
import datetime
import arrow
import chardet
import sys

# Python 2 only: make UTF-8 the interpreter's implicit str<->unicode codec.
# NOTE(review): the ``item.decode("utf-8")`` call and the writes in get_word()
# presumably operate on unicode objects produced by pandas and therefore rely
# on this setting -- confirm before removing it.
reload(sys)
sys.setdefaultencoding('utf8')

os.chdir("F:/new_srt")


def _read_srt(filepath):
    """Return the lines of *filepath* as a one-column DataFrame, or None.

    The encoding is guessed by chardet from the file's raw bytes.  None is
    returned when the guess is not one of the encodings handled below, so the
    caller can skip the file instead of reusing stale data.
    """
    # Binary mode: chardet inspects undecoded bytes, and the ``with`` block
    # closes the handle as soon as they have been read.
    with open(filepath, 'rb') as raw:
        encoding = chardet.detect(raw.read())['encoding']
    # The guess may be None; compare case-insensitively so the match does not
    # depend on the casing chardet uses for the encoding name.
    encoding = (encoding or '').lower()

    # sep='\r' and header=None are kept exactly as in the original calls.
    if encoding in ('utf-8', 'ascii'):
        # ASCII is a strict subset of UTF-8, so the UTF-8 reader handles both.
        return pd.read_csv(filepath, encoding="utf-8", sep='\r', header=None)
    if encoding == 'gb2312':
        try:
            return pd.read_csv(filepath, encoding="gbk", sep='\r', header=None)
        except UnicodeDecodeError:
            # Fallback kept from the original script: retry as UTF-8 when
            # decoding as GBK fails.
            return pd.read_csv(filepath, encoding="utf-8", sep='\r', header=None)
    if encoding == 'utf-8-sig':
        return pd.read_csv(filepath, encoding="UTF-8-SIG", sep='\r', header=None)
    return None


# get words of srt file
###########################################
def get_word(path="F:/new_srt", result_path='F:/result.txt'):
    """Append one line per subtitle file found in *path* to *result_path*.

    Each output line is the file name without its extension, followed by the
    values of the 'content' column separated by spaces.

    The parsed rows of every file are regrouped three at a time into the
    columns 'index', 'timecut' and 'content'; numpy raises ValueError when the
    number of rows is not a multiple of three.

    Files whose detected encoding is not handled are reported on stdout and
    skipped.  Directory entries that are not regular files are skipped too.

    Args:
        path: directory that holds the subtitle files.
        result_path: text file the merged lines are appended to.
    """
    for files in os.listdir(path):
        print(files)
        # Use the full path so the function does not depend on the current
        # working directory.
        filepath = os.path.join(path, files)
        if not os.path.isfile(filepath):
            continue

        data = _read_srt(filepath)
        if data is None:
            print('this is an error about %s' % files)
            # Skip the file: continuing would either hit an unbound ``data``
            # or write the previous file's text under this file's name.
            continue

        data_new = pd.DataFrame(np.reshape(data.values, (-1, 3)))
        data_new.columns = ['index', 'timecut', 'content']
        filename = os.path.splitext(files)[0]
        with open(result_path, 'a') as out:
            out.write(str(filename) + ' ')
            for item in data_new['content']:
                out.write(item.decode("utf-8") + ' ')
            out.write('\n')


if __name__ == '__main__':
    get_word()
以上这篇《Python 读写文件包含多种编码格式的解决方式》就是小编分享给大家的全部内容了,希望能给大家一个参考,也希望大家多多支持脚本之家。
联系客服