fix(aria2): drain stderr pipe to prevent aria2 freeze, retry RPC status on transient failure

Root cause: aria2c subprocess stderr pipe (64 KB buffer) was never
drained. When enough error/warning output accumulated, aria2's write()
blocked, freezing the entire process including its RPC handler. The
tellStatus call then timed out after 30s with asyncio.TimeoutError().
Because that exception stringifies to an empty string, the log showed
the empty error message in 'Failed to query aria2 download status: '.

Fixes:
- Drain stderr in a background task so the pipe never fills up
- Retry get_status() RPC calls up to 3 times on transient failure
- In the failure path, preserve .safetensors when .aria2 is absent
  (the download was likely complete on disk)
This commit is contained in:
Will Miao
2026-06-26 08:25:05 +08:00
parent 0ac10dfd42
commit 3a2941d751
3 changed files with 152 additions and 2 deletions

View File

@@ -352,3 +352,74 @@ async def test_resolve_authenticated_redirect_url_returns_location(monkeypatch):
)
assert result == "https://signed.example.com/file.safetensors"
@pytest.mark.asyncio
async def test_get_status_with_retry_passes_through_success(monkeypatch):
    """Return the first successful status without issuing any retry."""
    seen_ids = []

    async def always_active(_download_id):
        # Record every invocation so the test can prove only one call was made.
        seen_ids.append(_download_id)
        return {"status": "active", "completedLength": "50", "totalLength": "100"}

    client = Aria2Downloader()
    monkeypatch.setattr(client, "get_status", always_active)

    status = await client._get_status_with_retry("dummy")

    assert status is not None
    assert status["status"] == "active"
    assert len(seen_ids) == 1
@pytest.mark.asyncio
async def test_get_status_with_retry_succeeds_after_transient_failure(monkeypatch):
    """Recover when only the first status query raises Aria2Error."""
    attempts = []

    async def fail_once_then_complete(_download_id):
        attempts.append(_download_id)
        if len(attempts) > 1:
            return {"status": "complete", "completedLength": "100", "totalLength": "100"}
        # Only the very first call fails, simulating a transient RPC error.
        raise Aria2Error("timeout")

    client = Aria2Downloader()
    monkeypatch.setattr(client, "get_status", fail_once_then_complete)
    # Swap asyncio.sleep for an AsyncMock so any sleep awaited between
    # attempts returns immediately and the test stays fast.
    monkeypatch.setattr("py.services.aria2_downloader.asyncio.sleep", AsyncMock())

    status = await client._get_status_with_retry("dummy")

    assert status is not None
    assert status["status"] == "complete"
    assert len(attempts) == 2
@pytest.mark.asyncio
async def test_get_status_with_retry_raises_after_all_retries_exhausted(monkeypatch):
    """Raise a descriptive Aria2Error once every status query has failed."""

    async def always_failing(_download_id):
        raise Aria2Error("connection reset")

    client = Aria2Downloader()
    monkeypatch.setattr(client, "get_status", always_failing)
    # Swap asyncio.sleep for an AsyncMock so any sleep awaited between
    # attempts returns immediately.
    monkeypatch.setattr("py.services.aria2_downloader.asyncio.sleep", AsyncMock())

    with pytest.raises(Aria2Error) as raised:
        await client._get_status_with_retry("dummy")

    # The final error must report the attempt count and keep the original cause.
    for fragment in ("after 3 attempts", "connection reset"):
        assert fragment in str(raised.value)
@pytest.mark.asyncio
async def test_get_status_with_retry_returns_none_when_not_tracked(monkeypatch):
    """Propagate None for an untracked download id instead of raising."""
    # get_status is deliberately left unpatched: for a download id with no
    # registered transfer it is expected to return None, and the retry
    # wrapper should hand that value back unchanged.
    client = Aria2Downloader()

    assert await client._get_status_with_retry("nonexistent") is None