mirror of
https://github.com/willmiao/ComfyUI-Lora-Manager.git
synced 2026-06-26 12:51:16 -03:00
fix(aria2): drain stderr pipe to prevent aria2 freeze, retry RPC status on transient failure
Root cause: the aria2c subprocess's stderr pipe (64 KB buffer) was never drained. Once enough error/warning output accumulated, aria2's write() blocked, freezing the entire process, including its RPC handler. The tellStatus call then timed out after 30 s with asyncio.TimeoutError(), whose string form is empty, which produced the blank error message 'Failed to query aria2 download status: '.

Fixes:
- Drain stderr in a background task so the pipe never fills up
- Retry get_status() RPC calls up to 3 times on transient failure
- In the failure path, preserve the .safetensors file when its .aria2 control file is absent (the download was likely complete on disk)
This commit is contained in:
@@ -84,6 +84,7 @@ class Aria2Downloader:
|
||||
self._transfers: Dict[str, Aria2Transfer] = {}
|
||||
self._poll_interval = 0.5
|
||||
self._state_store = Aria2TransferStateStore()
|
||||
self._stderr_reader_task: Optional[asyncio.Task] = None
|
||||
|
||||
@property
|
||||
def is_running(self) -> bool:
|
||||
@@ -115,7 +116,7 @@ class Aria2Downloader:
|
||||
|
||||
try:
|
||||
while True:
|
||||
status = await self.get_status(download_id)
|
||||
status = await self._get_status_with_retry(download_id)
|
||||
if status is None:
|
||||
return False, "aria2 download not found"
|
||||
|
||||
@@ -136,6 +137,35 @@ class Aria2Downloader:
|
||||
finally:
|
||||
self._transfers.pop(download_id, None)
|
||||
|
||||
async def _get_status_with_retry(
    self, download_id: str, *, max_retries: int = 3, retry_delay: float = 1.0
) -> Optional[Dict[str, Any]]:
    """Call :meth:`get_status`, retrying when it raises :exc:`Aria2Error`.

    A single failed RPC call should not immediately fail the download,
    because aria2 may be temporarily busy (e.g. finalizing multiple
    concurrent downloads) and a retry will often succeed.

    Whatever ``get_status`` returns -- including ``None`` -- is handed back
    unchanged on the first successful call; only a raised
    :exc:`Aria2Error` triggers another attempt.

    Args:
        download_id: Identifier forwarded to ``get_status``.
        max_retries: Total number of attempts. Values below 1 are treated
            as 1 so that aria2 is always queried at least once.
        retry_delay: Seconds to sleep between two consecutive attempts.

    Returns:
        The value returned by ``get_status``.

    Raises:
        Aria2Error: When every attempt raised :exc:`Aria2Error`. The last
            failure is chained as ``__cause__``.
    """
    # A non-positive ``max_retries`` would otherwise skip the loop entirely
    # and report a failure without a single RPC call having been made.
    attempts = max(1, max_retries)
    last_exc: Optional[Exception] = None

    for attempt in range(1, attempts + 1):
        try:
            return await self.get_status(download_id)
        except Aria2Error as exc:
            last_exc = exc
            # Only warn and wait when another attempt will follow; the
            # final failure is reported through the raised exception.
            if attempt < attempts:
                logger.warning(
                    "aria2 get_status transient failure (attempt %d/%d) for %s: %s",
                    attempt, attempts, download_id, exc,
                )
                await asyncio.sleep(retry_delay)

    # Some exceptions stringify to "" (e.g. a bare timeout); fall back to
    # repr() so the final message never ends in a blank detail.
    detail = str(last_exc) or repr(last_exc)
    raise Aria2Error(
        f"Failed to query aria2 download status after {attempts} attempts: {detail}"
    ) from last_exc
|
||||
|
||||
async def _schedule_download(
|
||||
self,
|
||||
url: str,
|
||||
@@ -312,6 +342,16 @@ class Aria2Downloader:
|
||||
async def close(self) -> None:
|
||||
"""Shut down the RPC process and session."""
|
||||
|
||||
# Cancel the background stderr reader first so it stops reading
|
||||
# from the pipe before the subprocess is terminated.
|
||||
if self._stderr_reader_task is not None:
|
||||
self._stderr_reader_task.cancel()
|
||||
try:
|
||||
await asyncio.wait_for(self._stderr_reader_task, timeout=2.0)
|
||||
except (asyncio.CancelledError, asyncio.TimeoutError):
|
||||
pass
|
||||
self._stderr_reader_task = None
|
||||
|
||||
if self._rpc_session is not None:
|
||||
await self._rpc_session.close()
|
||||
self._rpc_session = None
|
||||
@@ -331,6 +371,23 @@ class Aria2Downloader:
|
||||
process.kill()
|
||||
await process.wait()
|
||||
|
||||
async def _drain_stderr(self) -> None:
    """Continuously drain aria2's stderr pipe so it never blocks.

    When the 64 KB pipe buffer fills up, aria2's ``write()`` to stderr
    blocks, which freezes the entire ``aria2c`` process -- including its
    RPC handler. This coroutine reads lines from stderr as they arrive
    and forwards each non-empty line to the module logger at DEBUG level.

    It returns when the stream reports end-of-file, when there is no
    process or stderr stream to read from, or when reading fails with an
    unexpected error. Cancellation is not intercepted and propagates to
    whoever awaits the task.
    """
    stream = getattr(self._process, "stderr", None)
    if stream is None:
        # No subprocess (or no captured stderr): nothing to drain.
        return

    while True:
        try:
            line = await stream.readline()
        except ValueError:
            # asyncio's StreamReader raises ValueError for a line longer
            # than its buffer limit and drops the oversized data. Keep
            # reading: stopping here would let the pipe fill up again.
            logger.debug("aria2 stderr: discarded an over-long line")
            continue
        except Exception:
            # Best effort only: a broken pipe must not crash the owner,
            # but leave a trace instead of failing silently.
            logger.debug("aria2 stderr reader stopped", exc_info=True)
            return

        if not line:
            # Empty read means end-of-file: the pipe was closed.
            return

        text = line.decode("utf-8", errors="replace").rstrip()
        if text:
            logger.debug("aria2 stderr: %s", text)
|
||||
|
||||
async def _dispatch_progress(self, callback, snapshot: DownloadProgress) -> None:
|
||||
try:
|
||||
result = callback(snapshot, snapshot)
|
||||
@@ -463,6 +520,14 @@ class Aria2Downloader:
|
||||
stderr=asyncio.subprocess.PIPE,
|
||||
)
|
||||
|
||||
# Drain aria2's stderr in a background task so the pipe buffer
|
||||
# never fills up. If the pipe blocks, aria2 itself freezes and
|
||||
# cannot respond to RPC — this was the root cause of the
|
||||
# "Failed to query aria2 download status" timeout bug.
|
||||
self._stderr_reader_task = asyncio.create_task(
|
||||
self._drain_stderr()
|
||||
)
|
||||
|
||||
await self._wait_until_ready()
|
||||
|
||||
def _resolve_executable(self) -> str:
|
||||
|
||||
@@ -2029,7 +2029,21 @@ class DownloadManager:
|
||||
break
|
||||
|
||||
last_error = result
|
||||
if os.path.exists(save_path):
|
||||
# For aria2: if the .aria2 control file is missing, aria2 considers
|
||||
# the download complete. A transient RPC failure may have made us
|
||||
# think the download failed even though the file is fully on disk.
|
||||
# Keep the file so a retry can find it already complete.
|
||||
if (
|
||||
transfer_backend == "aria2"
|
||||
and os.path.exists(save_path)
|
||||
and not os.path.exists(f"{save_path}.aria2")
|
||||
):
|
||||
logger.warning(
|
||||
"aria2 download reported failure but .aria2 file is absent "
|
||||
"for %s — the file is likely complete. Preserving it for retry.",
|
||||
save_path,
|
||||
)
|
||||
elif os.path.exists(save_path):
|
||||
try:
|
||||
os.remove(save_path)
|
||||
except Exception as e:
|
||||
|
||||
Reference in New Issue
Block a user