Skip to content

Commit 94f8912

Browse files
Merge e94d635 into 62292ca
2 parents 62292ca + e94d635 commit 94f8912

8 files changed

Lines changed: 2035 additions & 2 deletions

File tree

pyproject.toml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ dependencies = [
4949
"mdx_truly_sane_lists==1.3",
5050
"markdown-link-attr-modifier==0.2.1",
5151
"mdx-gh-links==0.4",
52+
# Local image captioning (ONNX model inference)
53+
"onnxruntime == 1.19.2",
5254
]
5355

5456
[project.urls]
@@ -322,11 +324,11 @@ lint = [
322324
"pyright==1.1.401",
323325
]
324326
license-check = [
325-
"licensecheck==2024.3",
327+
"licensecheck==2025.1",
326328
]
327329
dev-docs = [
328330
# For building developer documentation
329-
"sphinx==8.1.2",
331+
"sphinx==8.1.3",
330332
"sphinx_rtd_theme==3.0.1",
331333
]
332334
system-tests = [

source/_localCaptioner/__init__.py

Lines changed: 263 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,263 @@
1+
# -*- coding: UTF-8 -*-
2+
# A part of NonVisual Desktop Access (NVDA)
3+
# Copyright (C) 2025 NV Access Limited, tianze
4+
# This file may be used under the terms of the GNU General Public License, version 2 or later, as modified by the NVDA license.
5+
# For full terms and any additional permissions, see the NVDA license file: https://github.com/nvaccess/nvda/blob/master/copying.txt
6+
"""Localcaptioner module for NVDA.
7+
8+
This module provides local image captioning functionality using ONNX models.
9+
It allows users to capture screen regions and generate captions using local AI models.
10+
"""
11+
12+
from __future__ import unicode_literals
13+
14+
import os
15+
import sys
16+
from typing import Optional
17+
import base64
18+
import json
19+
import io
20+
import threading
21+
22+
import wx
23+
import gui
24+
from gui import guiHelper
25+
import globalVars
26+
import config
27+
from logHandler import log
28+
from keyboardHandler import KeyboardInputGesture
29+
import scriptHandler
30+
import ui
31+
import globalPluginHandler
32+
import api
33+
34+
from .captioner import ImageCaptioner
35+
from .modelManager import ModelManagerFrame
36+
from .panel import CaptionLocalSettingsPanel
37+
38+
# Translation setup is best-effort: a failure here must not prevent the module
# from loading, but it is logged so the cause can be diagnosed.
try:
    import addonHandler

    addonHandler.initTranslation()
except Exception:
    # Catch Exception rather than using a bare ``except`` so that SystemExit
    # and KeyboardInterrupt still propagate.
    log.debug("addonHandler.initTranslation() failed; continuing without it", exc_info=True)
44+
45+
# Module-level configuration.
# The default model directory is "<this package>/../../models", normalised to an absolute path.
_here = os.path.dirname(__file__)
_modelsDir = os.path.abspath(os.path.join(_here, "..", "..", "models"))

# Configuration spec for the "captionLocal" section.
# NOTE(review): the path default is embedded unquoted in the spec string; a
# directory containing commas or parentheses may not parse as intended -
# confirm against the config validator.
CONFSPEC = {
    "localModelPath": f"string(default={_modelsDir}/Xenova/vit-gpt2-image-captioning)",
    "loadModelWhenInit": "boolean(default=false)",
}

# Register the section so config.conf["captionLocal"] resolves with the defaults above.
config.conf.spec["captionLocal"] = CONFSPEC
56+
57+
58+
def shootImage() -> bytes:
    """Capture a screenshot of the current navigator object.

    Returns:
        The captured image data as bytes in JPEG format.

    Raises:
        RuntimeError: If the navigator object has no usable on-screen
            rectangle, or the captured image cannot be encoded as JPEG.
    """
    # The navigator object (not necessarily the focused object) defines the region to capture.
    obj = api.getNavigatorObject()

    location = obj.location
    if location is None:
        raise RuntimeError("The navigator object has no screen location")
    x, y, width, height = location
    if width <= 0 or height <= 0:
        raise RuntimeError("The navigator object has an empty screen location")

    # Copy the screen region into an off-screen bitmap of the same size.
    bmp = wx.Bitmap(width, height)
    mem = wx.MemoryDC(bmp)
    try:
        mem.Blit(0, 0, width, height, wx.ScreenDC(), x, y)
    finally:
        # Deselect the bitmap from the DC before it is used elsewhere.
        mem.SelectObject(wx.NullBitmap)

    # Encode the bitmap as JPEG into an in-memory buffer.
    image = bmp.ConvertToImage()
    body = io.BytesIO()
    if not image.SaveFile(body, wx.BITMAP_TYPE_JPEG):
        raise RuntimeError("Failed to encode the captured image as JPEG")
    return body.getvalue()
91+
92+
93+
def caption(captioner: ImageCaptioner, imageData: bytes) -> None:
    """Generate a caption for the given image data and present it to the user.

    The caption is reported via ``ui.message`` and copied to the clipboard.
    Any failure is reported to the user and logged; nothing is re-raised.

    Args:
        captioner: The captioner instance to use for generation.
        imageData: The image data to caption.
    """
    try:
        description = captioner.generate_caption(image=imageData)
        ui.message(description)
        api.copyToClip(text=description, notify=False)
    except Exception as e:
        ui.message(str(e))
        # Include the traceback so the failure can be diagnosed from the log.
        log.error("Image captioning failed", exc_info=True)
107+
108+
109+
def disableInSecureMode(decoratedCls):
    """Class decorator that swaps the class out when running in secure mode.

    Args:
        decoratedCls: The class being decorated.

    Returns:
        ``globalPluginHandler.GlobalPlugin`` when ``globalVars.appArgs.secure``
        is truthy, otherwise ``decoratedCls`` unchanged.
    """
    return globalPluginHandler.GlobalPlugin if globalVars.appArgs.secure else decoratedCls
113+
114+
115+
@disableInSecureMode
class LocalCaptioner:
    """Local image captioning controller.

    Captures the navigator object's screen region and generates a caption for
    it with a local ONNX model. Also registers the captioner settings panel
    and can open the model manager window.
    """

    def __init__(self) -> None:
        """Initialize state, optionally preload the model, and register the settings panel."""
        self.isModelLoaded = False
        self.captioner: Optional[ImageCaptioner] = None
        self.managerFrame: Optional[ModelManagerFrame] = None

        # Preloading is opt-in because it may cause high memory usage.
        # It runs on a daemon thread so initialization is not blocked.
        if config.conf["captionLocal"]["loadModelWhenInit"]:
            threading.Thread(target=self._loadModel, daemon=True).start()

        gui.settingsDialogs.NVDASettingsDialog.categoryClasses.append(CaptionLocalSettingsPanel)

    def terminate(self) -> None:
        """Clean up resources when the captioner is terminated."""
        try:
            gui.settingsDialogs.NVDASettingsDialog.categoryClasses.remove(CaptionLocalSettingsPanel)
        except (ValueError, AttributeError):
            # The panel was never registered or was already removed.
            pass

    def runCaption(self, gesture) -> None:
        """Script to run image captioning on the current navigator object.

        The screenshot is taken first; the model is then loaded synchronously
        if needed, and caption generation runs on a separate thread.

        Args:
            gesture: The input gesture that triggered this script.
        """
        imageData = shootImage()

        if not self.isModelLoaded:
            # Translators: Message when loading the model
            ui.message(_("loading model..."))
            self._loadModel()

        imageThread = threading.Thread(target=caption, args=(self.captioner, imageData))
        # Translators: Message when starting image recognition
        ui.message(_("starting recognize"))
        imageThread.start()

    def _loadModel(self) -> None:
        """Load the ONNX model for image captioning.

        Paths are derived from the configured ``localModelPath`` directory.

        Raises:
            Exception: If the model cannot be loaded. The error is reported to
                the user before being re-raised.
        """
        try:
            localModelDirPath = config.conf["captionLocal"]["localModelPath"]
            encoderPath = f"{localModelDirPath}/onnx/encoder_model_quantized.onnx"
            decoderPath = f"{localModelDirPath}/onnx/decoder_model_merged_quantized.onnx"
            configPath = f"{localModelDirPath}/config.json"

            self.captioner = ImageCaptioner(
                encoder_path=encoderPath,
                decoder_path=decoderPath,
                config_path=configPath,
            )
            self.isModelLoaded = True
        except Exception as e:
            self.isModelLoaded = False
            ui.message(str(e))
            raise

    def releaseModel(self, gesture) -> None:
        """Script to release the loaded model from memory.

        Args:
            gesture: The input gesture that triggered this script.
        """
        # Translators: Message when releasing the model
        ui.message(_("releasing model..."))
        try:
            if self.captioner is not None:
                # Dropping the only reference held here lets the model be garbage collected.
                self.captioner = None
                # Translators: Message when model is successfully released
                ui.message(_("model released and memory freed"))
            self.isModelLoaded = False
        except Exception as e:
            ui.message(str(e))
            raise

    def openManager(self, gesture) -> None:
        """Script to open the model manager window.

        Args:
            gesture: The input gesture that triggered this script.
        """
        # Translators: Message when opening model manager
        ui.message(_("opening model manager..."))
        try:
            self._openModelManager()
        except Exception as e:
            ui.message(str(e))
            raise

    def _openModelManager(self) -> None:
        """Schedule the model manager window to be shown via ``wx.CallAfter``."""

        def showManager() -> None:
            """Create the manager frame if needed, then show and raise it."""
            try:
                # Truthiness (not ``is None``) is deliberate so that a falsy
                # frame reference is also replaced with a new frame.
                if not self.managerFrame:
                    self.managerFrame = ModelManagerFrame()

                self.managerFrame.Show()
                self.managerFrame.Raise()
            except Exception as e:
                ui.message(str(e))
                log.error("Failed to open the model manager", exc_info=True)

        # Ensure execution in main thread
        wx.CallAfter(showManager)
242+
243+
244+
def getLocalCaptionerConfig():
    """Return the local captioner's configuration section.

    The section name matches the key under which ``CONFSPEC`` is registered
    in ``config.conf.spec`` ("captionLocal").
    """
    return config.conf["captionLocal"]
246+
247+
248+
def initialize():
    """Create the module-wide ``LocalCaptioner`` instance and store it in ``_localCaptioner``."""
    global _localCaptioner
    log.debug("Initializing local captioner")
    instance = LocalCaptioner()
    _localCaptioner = instance
253+
254+
255+
# Module-wide captioner instance; ``None`` whenever no captioner is running.
# Defined here so that terminate() is safe to call even if initialize() never ran.
_localCaptioner = None


def terminate():
    """Terminate the local captioner.

    Does nothing (apart from a debug log entry) when no captioner is running.
    """
    global _localCaptioner
    if _localCaptioner is None:
        log.debug("local captioner not running.")
        return
    log.debug("Terminating local captioner")
    _localCaptioner.terminate()
    _localCaptioner = None

0 commit comments

Comments
 (0)