初始提交
This commit is contained in:
104
download_unzip_csv.py
Normal file
104
download_unzip_csv.py
Normal file
@@ -0,0 +1,104 @@
|
||||
import requests
|
||||
import pandas as pd
|
||||
from io import BytesIO
|
||||
import logging
|
||||
|
||||
# Configure logging: INFO-level records rendered as "time - level - message".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger shared by everything in this file.
logger = logging.getLogger(__name__)
|
||||
|
||||
# Substrings whose presence in any cell of the first row marks it as a header
# (time-related, price-related and trade-related column names).
_HEADER_KEYWORDS = (
    'time', 'date', 'datetime', 'timestamp',
    'open', 'high', 'low', 'close', 'volume',
    'taker', 'quote', 'count', 'ignore',
)

# Target dtype for each column that is converted when present in the frame.
# 'int64' is spelled out (instead of ``int``) so that large integer values
# such as millisecond timestamps do not depend on the platform's default
# integer width.
# NOTE(review): 'close_time' has always been cast to float by this function;
# that is kept for backward compatibility.
_NUMERIC_DTYPES = {
    'open_time': 'int64',
    'open': float,
    'high': float,
    'low': float,
    'close': float,
    'volume': float,
    'close_time': float,
    'quote_asset_volume': float,
    'quote_volume': float,
    'number_of_trades': 'int64',
    'count': 'int64',
    'taker_buy_volume': float,
    'taker_buy_quote_volume': float,
    'taker_buy_base_asset_volume': float,
    'taker_buy_quote_asset_volume': float,
    'ignore': float,
}


def _is_plain_number(cell):
    """Return True if ``str(cell)`` consists only of digits once dots are removed."""
    return str(cell).replace('.', '').isdigit()


def _looks_like_header(row, next_row=None):
    """Heuristically decide whether *row* contains column titles instead of data.

    The row is treated as a header when any of the following holds:

    * a cell contains one of ``_HEADER_KEYWORDS`` (case-insensitive substring);
    * a cell contains an underscore;
    * no cell in *row* is a plain number while *next_row* (if given) has at
      least one non-null plain-number cell.

    Args:
        row: Iterable of cell values for the first row.
        next_row: Iterable of cell values for the second row, or None when the
            frame has only one row.

    Returns:
        bool: True if the row looks like a header.
    """
    cells = [str(cell) for cell in row]

    if any(keyword in text.lower() for text in cells for keyword in _HEADER_KEYWORDS):
        return True
    if any('_' in text for text in cells):
        return True
    if next_row is None:
        return False
    return (
        not any(_is_plain_number(text) for text in cells)
        and any(_is_plain_number(cell) for cell in next_row if pd.notna(cell))
    )


def _convert_numeric_columns(df):
    """Cast the known numeric columns of *df* in place; log a warning on failure."""
    for col, dtype in _NUMERIC_DTYPES.items():
        if col not in df.columns:
            continue
        try:
            df[col] = df[col].astype(dtype)
        except (ValueError, TypeError, OverflowError):
            # Best effort: leave the column as it was read.
            logger.warning(f"Could not convert column {col} to numeric type")


def download_unzip_csv(url, *, timeout=30, **kwargs):
    """Download a ZIP file from a URL and return the CSV inside as a DataFrame.

    The response body is kept in memory and handed to ``pandas.read_csv`` with
    ``compression='zip'``. If the first row of the parsed frame looks like a
    header row (see ``_looks_like_header``), it is dropped. Known numeric
    columns that are present are then cast to int64/float.

    Args:
        url: URL of the file to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``, so the
            call cannot block indefinitely on an unresponsive server.
        **kwargs: Extra keyword arguments forwarded to ``pandas.read_csv``.

    Returns:
        pandas.DataFrame: The parsed CSV content.

    Raises:
        requests.exceptions.RequestException: If the download fails (logged,
            then re-raised).
        Exception: Any error raised while parsing (logged, then re-raised).
    """
    logger.info(f"Downloading file from {url}")

    try:
        # Download the file; raise_for_status turns HTTP error codes into exceptions.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        logger.info(f"Successfully downloaded file from {url}")

        # Wrap the payload in a file-like object so pandas can unzip it in memory.
        zip_data = BytesIO(response.content)

        logger.info("Reading and decompressing CSV file")
        df = pd.read_csv(zip_data, compression='zip', **kwargs)

        # Drop the first row when it turns out to be a header read as data
        # (e.g. the caller passed header=None for a file that has a header).
        if len(df) > 0:
            second_row = df.iloc[1] if len(df) > 1 else None
            if _looks_like_header(df.iloc[0], second_row):
                df = df.iloc[1:].reset_index(drop=True)
                logger.info("Skipped header row")

        _convert_numeric_columns(df)

        logger.info(f"Successfully parsed CSV file with {len(df)} rows")
        return df
    except requests.exceptions.RequestException as e:
        logger.error(f"Failed to download file: {e}")
        raise
    except Exception as e:
        logger.error(f"Failed to process file: {e}")
        raise
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: fetch one archive and print a summary of the result.
    # Column names applied to the CSV, which is read with header=None.
    kline_columns = [
        "open_time",
        "open",
        "high",
        "low",
        "close",
        "volume",
        "close_time",
        "quote_asset_volume",
        "number_of_trades",
        "taker_buy_base_asset_volume",
        "taker_buy_quote_asset_volume",
        "ignore",
    ]
    sample_url = "https://data.binance.vision/data/futures/um/monthly/klines/BTCUSDT/1d/BTCUSDT-1d-2024-01.zip"

    try:
        frame = download_unzip_csv(sample_url, header=None, names=kline_columns)
        print(f"DataFrame shape: {frame.shape}")
        for title, value in (
            ("DataFrame head:", frame.head()),
            ("DataFrame dtypes:", frame.dtypes),
        ):
            print(title)
            print(value)
    except Exception as exc:
        print(f"Error: {exc}")
|
||||
Reference in New Issue
Block a user