# Load the entire CSV file into a DataFrame, then display it.
result = pd.read_csv('ch06/ex6.csv')
result
onetwothreefourkey00.467976-0.038649-0.295344-1.824726L1-0.3588931.4044530.704965-0.200638B2-0.5018400.659254-0.421691-0.057688G30.2048861.0741341.388361-0.982404R40.354628-0.1331160.283763-0.837063Q51.8174800.7422730.419395-2.251035Q6-0.7767640.935518-0.332872-1.875641U7-0.9131351.530624-0.5726570.477252K80.358480-0.497572-0.3670160.507702S9-1.740877-1.160417-1.6378302.172201G100.240564-0.3282491.2521551.0727968110.7640181.165476-0.6395441.495258R120.571035-0.3105370.582437-0.2987651132.3176580.430710-1.3342160.199679P141.547771-1.119753-2.2776340.329586J15-1.3106080.401719-1.0009871.156708E16-0.0884960.6347120.1533240.415335B17-0.018663-0.247487-1.4465220.750938A18-0.070127-1.5790970.1208920.671432F19-0.194678-0.4920392.3596050.319810H20-0.2486180.868707-0.492226-0.717959W21-1.091549-0.867110-0.647760-0.832562C220.641404-0.138822-0.621963-0.284839C231.2164080.9926870.165162-0.069619V24-0.5644740.7928320.7470530.571675I251.759879-0.515666-0.2304811.362317S260.1262660.3092810.382820-0.239199L271.334360-0.100152-0.840731-0.643967628-0.7376200.278087-0.053235-0.950972J29-1.148486-0.986292-0.1449630.124362Y………………99700.633495-0.1865240.9276270.143164499710.308636-0.1128570.762842-1.07297719972-1.627051-0.9781510.154745-1.229037Z99730.3148470.0979890.1996080.955193P99741.6669070.9920050.496128-0.686391S99750.0106030.708540-1.2587110.226541K99760.118693-0.714455-0.501342-0.254764K99770.302616-2.011527-0.6280850.768827H9978-0.0985721.769086-0.215027-0.053076A9979-0.0190581.9649940.738538-0.883776F9980-0.5953490.001781-1.423355-1.458477M99811.392170-1.396560-1.425306-0.847535H9982-0.896029-0.1522871.9244830.36518469983-2.274642-0.9018741.5003520.996541N9984-0.3018981.0199061.1021602.624526I9985-2.548389-0.5853741.496201-0.718815D9986-0.0645880.759292-1.568415-0.420933E9987-0.143365-1.111760-1.8155810.43527429988-0.070412-1.0559210.338017-0.440763X99890.6491480.994273-1.3842270.485120Q9990-0.3707690.404356-1.051628-1.05089989991-0.4099800.155627-0.8189901.277350W99920.301214
-1.1112030.6682580.671922A99931.8211170.4164450.1738740.505118X99940.0688041.3227590.8023460.223618H99952.311896-0.417070-1.409599-0.515821L9996-0.479893-0.6504190.745152-0.646038E99970.5233310.7871120.4860661.093156K9998-0.3625590.598894-1.8432010.887292G9999-0.096376-1.012999-0.657431-0.5733150
10000 rows × 5 columns
# To read only the first few rows, pass the desired row count via nrows.
pd.read_csv('ch06/ex6.csv', nrows=5)
        one       two     three      four key
0  0.467976 -0.038649 -0.295344 -1.824726   L
1 -0.358893  1.404453  0.704965 -0.200638   B
2 -0.501840  0.659254 -0.421691 -0.057688   G
3  0.204886  1.074134  1.388361 -0.982404   R
4  0.354628 -0.133116  0.283763 -0.837063   Q
# To read the file piece by piece, set chunksize (a number of rows).
chunker = pd.read_csv('ch06/ex6.csv', chunksize=1000)
chunker
# The TextParser object returned by read_csv lets you iterate over the file in pieces of chunksize rows.
from pandas import Series,DataFrame
# Tally how often each value of the 'key' column occurs across all chunks
# produced by `chunker`.
# The accumulator is given an explicit float dtype: an empty Series built
# without a dtype triggers a deprecation warning on older pandas and becomes
# object-typed on newer ones, whereas the counts here are meant to be numeric.
tot = pd.Series([], dtype=float)
for piece in chunker:
    # fill_value=0 treats a key missing from either side as a count of zero
    # rather than producing NaN when the two Series are aligned.
    tot = tot.add(piece['key'].value_counts(), fill_value=0)
# Order the totals from most to least frequent key.
tot = tot.sort_values(ascending=False)
tot[:10]
E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
dtype: float64
# TextParser also has a get_chunk method, which lets you read a piece of arbitrary size.
from lxml import objectify
# Parse the XML file at `path` with lxml.objectify and build a DataFrame with
# one row per <INDICATOR> element and one column per retained child tag.
path = 'ch06/mta_perf/Performance_MNR.xml'
# Hand the path straight to lxml so it opens and closes the file itself;
# wrapping it in a bare open(path) left the file handle unclosed.
parsed = objectify.parse(path)
root = parsed.getroot()

data = []
# Child tags that are left out of every record.
skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ', 'DESIRED_CHANGE', 'DECIMAL_PLACES']
for elt in root.INDICATOR:
    el_data = {}
    # iterchildren() is the supported replacement for the deprecated
    # getchildren(); it visits the same direct children in document order.
    for child in elt.iterchildren():
        if child.tag in skip_fields:
            continue
        # pyval is the element's text converted to a Python value by objectify.
        el_data[child.tag] = child.pyval
    data.append(el_data)

perf = pd.DataFrame(data)
perf
AGENCY_NAMECATEGORYDESCRIPTIONFREQUENCYINDICATOR_NAMEINDICATOR_UNITMONTHLY_ACTUALMONTHLY_TARGETPERIOD_MONTHPERIOD_YEARYTD_ACTUALYTD_TARGET0Metro-North RailroadService IndicatorsPercent of commuter trains that arrive at thei…MOn-Time Performance (West of Hudson)