Optimize download_binance_kline.py: extract global proxy config, fix warnings, refactor main function

This commit is contained in:
yhydev
2026-01-14 11:17:06 +08:00
parent 1849e67f54
commit 27a8e3d64b

View File

@@ -21,6 +21,12 @@ INTERVAL = "1d"
START_DATE = "2021-01"
END_DATE = datetime.now().strftime("%Y-%m")
# Proxy configuration shared by every outbound HTTP request in this script
PROXIES = {
    scheme: 'http://localhost:1080'
    for scheme in ('http', 'https')
}
# PostgreSQL配置
DB_CONFIG = {
"host": "localhost",
@@ -45,14 +51,8 @@ def download_kline_data(symbol, interval, year_month):
url = f"{BASE_URL}{symbol}/{interval}/{symbol}-{interval}-{year_month}.zip"
logger.info(f"Downloading {url}")
# 配置代理
proxies = {
'http': 'http://localhost:1080',
'https': 'http://localhost:1080'
}
try:
response = requests.get(url, proxies=proxies)
response = requests.get(url, proxies=PROXIES)
response.raise_for_status()
logger.info(f"Downloaded {url} successfully")
return BytesIO(response.content)
@@ -136,8 +136,15 @@ def insert_data(conn, df):
ON CONFLICT (symbol, open_time) DO NOTHING;
"""
# 定义要插入的列顺序
insert_columns = [
"symbol", "open_time", "open", "high", "low", "close", "volume",
"close_time", "quote_asset_volume", "number_of_trades",
"taker_buy_base_asset_volume", "taker_buy_quote_asset_volume", "ignore"
]
# 转换DataFrame为元组列表
data = [tuple(row) for row in df[df.columns[1:]].to_numpy()] # 跳过id列
data = [tuple(row) for row in df[insert_columns].to_numpy()]
try:
with conn.cursor() as cur:
@@ -161,14 +168,8 @@ def list_s3_files(url, timeout=10):
"""
logger.info(f"Listing files from {url}")
# 配置代理
proxies = {
'http': 'http://localhost:1080',
'https': 'http://localhost:1080'
}
try:
response = requests.get(url, timeout=timeout, proxies=proxies)
response = requests.get(url, timeout=timeout, proxies=PROXIES)
response.raise_for_status()
# 解析XML响应
@@ -214,14 +215,8 @@ def download_kline_data_by_url(url):
"""
logger.info(f"Downloading {url}")
# 配置代理
proxies = {
'http': 'http://localhost:1080',
'https': 'http://localhost:1080'
}
try:
response = requests.get(url, proxies=proxies)
response = requests.get(url, proxies=PROXIES)
response.raise_for_status()
filename = os.path.basename(url)
logger.info(f"Downloaded {filename} successfully")
@@ -266,9 +261,9 @@ def process_symbol(symbol, interval=INTERVAL):
# 添加symbol列
df["symbol"] = symbol
# 转换时间戳为datetime
df["open_time"] = pd.to_datetime(df["open_time"], unit='ms')
df["close_time"] = pd.to_datetime(df["close_time"], unit='ms')
# 转换时间戳为datetime先转换为数值类型以避免FutureWarning
df["open_time"] = pd.to_datetime(df["open_time"].astype(float), unit='ms')
df["close_time"] = pd.to_datetime(df["close_time"].astype(float), unit='ms')
all_dfs.append(df)
logger.info(f"Processed {os.path.basename(file_url)} with {len(df)} rows")
@@ -306,55 +301,15 @@ def process_symbol(symbol, interval=INTERVAL):
def main():
# 创建数据库连接
conn = create_connection()
if not conn:
return
# 创建表
create_table(conn)
"""主函数,处理所有配置的交易对"""
logger.info("Starting main process for symbols: %s", SYMBOLS)
for symbol in SYMBOLS:
# 使用list_s3_files函数获取可用的文件URL列表
s3_url = f"https://s3-ap-northeast-1.amazonaws.com/data.binance.vision?delimiter=/&prefix=data/futures/um/monthly/klines/{symbol}/{INTERVAL}/"
file_urls = list_s3_files(s3_url)
try:
process_symbol(symbol, INTERVAL)
except Exception as e:
logger.error(f"Failed to process symbol {symbol}: {e}")
if not file_urls:
logger.warning(f"No files found for {symbol}-{INTERVAL}")
continue
# 处理每个文件URL
for file_url in file_urls:
# 从URL中提取文件名
filename = os.path.basename(file_url)
# 检查文件名格式
if not filename.endswith('.zip'):
continue
# 解析文件名,提取交易对和年月信息
# 格式: symbol-interval-year-month.zip
parts = filename[:-4].split('-') # 移除.zip后缀并拆分
if len(parts) != 4:
logger.warning(f"Invalid filename format: {filename}")
continue
file_symbol, file_interval, year, month = parts
# 下载数据
zip_data = download_kline_data_by_url(file_url)
if not zip_data:
continue
# 解析数据
df = parse_kline_data(zip_data, file_symbol)
if df is None or df.empty:
continue
# 插入数据
insert_data(conn, df)
# 关闭连接
conn.close()
logger.info("Script completed successfully")
if __name__ == "__main__":