76 lines
2.1 KiB
Python
76 lines
2.1 KiB
Python
|
|
"""
|
||
|
|
One-time download of all defeatbeta parquet files + company_tickers.json.
|
||
|
|
Run this once; after that use offline.py for zero-network Ticker() calls.
|
||
|
|
|
||
|
|
uv run python download_data.py
|
||
|
|
uv run python download_data.py --out data/parquet # custom directory
|
||
|
|
"""
|
||
|
|
import argparse
|
||
|
|
import sys
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
import requests
|
||
|
|
|
||
|
|
BASE = "https://huggingface.co/datasets/defeatbeta/yahoo-finance-data/resolve/main"
|
||
|
|
|
||
|
|
PARQUET_TABLES = [
|
||
|
|
"stock_profile",
|
||
|
|
"stock_officers",
|
||
|
|
"stock_tailing_eps",
|
||
|
|
"stock_earning_calendar",
|
||
|
|
"stock_statement",
|
||
|
|
"stock_prices",
|
||
|
|
"stock_dividend_events",
|
||
|
|
"stock_split_events",
|
||
|
|
"exchange_rate",
|
||
|
|
"daily_treasury_yield",
|
||
|
|
"stock_earning_call_transcripts",
|
||
|
|
"stock_news",
|
||
|
|
"stock_revenue_breakdown",
|
||
|
|
"stock_shares_outstanding",
|
||
|
|
"stock_sec_filing",
|
||
|
|
]
|
||
|
|
|
||
|
|
EXTRA_FILES = [
|
||
|
|
("data/company_tickers.json", "company_tickers.json"),
|
||
|
|
]
|
||
|
|
|
||
|
|
|
||
|
|
def download(url: str, dest: Path, label: str) -> None:
    """Stream *url* to *dest*, skipping the transfer if *dest* already exists.

    The body is written to a sibling temporary file first and moved into
    place only after the whole response has been received, so *dest* never
    holds a partial download.

    Args:
        url: Remote location to fetch.
        dest: Final path of the downloaded file.
        label: Short name shown in the progress output.

    Raises:
        requests.RequestException: The request failed or the server returned
            an error status (via ``raise_for_status``).
        OSError: The temporary or destination file could not be written.
    """
    if dest.exists():
        print(f" skip {label} ({dest.stat().st_size / 1e6:.1f} MB on disk)")
        return

    print(f" fetch {label} ...", end="", flush=True)

    # Append ".tmp" to the full name instead of replacing the suffix, so two
    # destinations differing only by extension never share a temp file.
    tmp = dest.with_name(dest.name + ".tmp")
    try:
        with requests.get(url, stream=True, timeout=60) as r:
            r.raise_for_status()
            with open(tmp, "wb") as f:
                for chunk in r.iter_content(chunk_size=8 * 1024 * 1024):
                    f.write(chunk)
        # replace() overwrites atomically where the platform allows it and,
        # unlike rename(), does not fail on Windows if dest appeared meanwhile.
        tmp.replace(dest)
    except BaseException:
        # Do not leave a partial file behind on error or Ctrl-C.
        tmp.unlink(missing_ok=True)
        raise

    print(f" {dest.stat().st_size / 1e6:.1f} MB")
|
||
|
|
|
||
|
|
|
||
|
|
def main() -> None:
    """Download every parquet table and extra file into the ``--out`` directory.

    Parses ``--out`` (default ``data/parquet``) from the command line, creates
    the directory if needed, then calls ``download`` for each entry of
    ``PARQUET_TABLES`` and ``EXTRA_FILES``.

    A network or HTTP failure on one file is reported and the remaining files
    are still attempted; if anything failed the process exits with a non-zero
    status after the summary line. Other errors (for example ``OSError`` while
    writing to disk) propagate unchanged.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--out", default="data/parquet", help="local output directory")
    args = parser.parse_args()

    out = Path(args.out)
    out.mkdir(parents=True, exist_ok=True)
    print(f"Saving to: {out.resolve()}\n")

    # One (url, destination, label) triple per file, tables first.
    jobs = [
        (f"{BASE}/data/{table}.parquet", out / f"{table}.parquet", table)
        for table in PARQUET_TABLES
    ]
    jobs += [
        (f"{BASE}/{remote_path}", out / local_name, local_name)
        for remote_path, local_name in EXTRA_FILES
    ]

    failed = []
    for url, dest, label in jobs:
        try:
            download(url, dest, label)
        except requests.RequestException as exc:
            # download() leaves its " fetch <label> ..." line open; finish it.
            print(f" FAILED ({exc})")
            failed.append(label)

    print(f"\nDone. {sum(1 for _ in out.iterdir())} files in {out.resolve()}")

    if failed:
        # A string argument makes sys.exit print it to stderr and exit with 1.
        sys.exit(f"{len(failed)} download(s) failed: {', '.join(failed)}")
|
||
|
|
|
||
|
|
|
||
|
|
# Run the downloader only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|