Github Copilot 实战: 从零开始用AI写一个OCR工具 (3)

技术分享 12个月前 (05-28) 0 999+

源码

https://github.com/densen2014/Blazor100/tree/master/AI/MiOcr

添加一个屏幕截图功能,显示截图起始点,结束点,截图区域,按键ESC取消截图

这里AI就比较中规中矩,很快就能得到我要的功能了.下面只简单贴一下代码

ScreenCaptureWindow.xaml

<Window x:Class="MiOcr.ScreenCaptureWindow"         xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"         xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"         WindowStyle="None" AllowsTransparency="True" Background="#01000000"         Topmost="True" ShowInTaskbar="False" WindowState="Maximized">     <Canvas x:Name="CaptureCanvas">         <TextBlock x:Name="StartCoordText"                Foreground="Yellow"                Background="#80000000"                FontSize="14"                Visibility="Collapsed"/>         <TextBlock x:Name="CurrentCoordText"                Foreground="Yellow"                Background="#80000000"                FontSize="14"                Visibility="Collapsed"/>         <TextBlock x:Name="SizeText"                Foreground="Yellow"                Background="#80000000"                FontSize="14"                Visibility="Collapsed"/>     </Canvas>  </Window>

ScreenCaptureWindow.xaml.cs

using System.Windows; using System.Windows.Controls; using System.Windows.Input; using System.Windows.Interop; using System.Windows.Media; using System.Windows.Media.Imaging; using System.Windows.Shapes;  namespace MiOcr;  public partial class ScreenCaptureWindow : Window {     public Rect SelectedRect { get; private set; }     public BitmapSource? CapturedImage { get; private set; }      private System.Windows.Point? _start;     private Rectangle? _rectShape;      public ScreenCaptureWindow()     {         InitializeComponent();         MouseLeftButtonDown += OnMouseDown;         MouseMove += OnMouseMove;         MouseLeftButtonUp += OnMouseUp;         Cursor = Cursors.Cross;         PreviewKeyDown += ScreenCaptureWindow_PreviewKeyDown;         Focusable = true;         Loaded += (s, e) => Keyboard.Focus(this);     }      private void ScreenCaptureWindow_PreviewKeyDown(object sender, KeyEventArgs e)     {         if (e.Key == Key.Escape)         {             CapturedImage = null;             DialogResult = false;             Close();         }     }      private void PositionTextBlocks(double x, double y, double w, double h)     {         double margin = 8;         double canvasWidth = CaptureCanvas.ActualWidth;         double canvasHeight = CaptureCanvas.ActualHeight;          // 先测量文本大小         StartCoordText.Measure(new Size(double.PositiveInfinity, double.PositiveInfinity));         SizeText.Measure(new Size(double.PositiveInfinity, double.PositiveInfinity));         double startW = StartCoordText.DesiredSize.Width;         double startH = StartCoordText.DesiredSize.Height;         double sizeW = SizeText.DesiredSize.Width;         double sizeH = SizeText.DesiredSize.Height;          // 1. 左上优先         double startX = x - startW - margin;         double startY = y - startH - margin;         if (startX >= 0 && startY >= 0)         {             Canvas.SetLeft(StartCoordText, startX);             Canvas.SetTop(StartCoordText, startY);             Canvas.SetLeft(SizeText, startX);             Canvas.SetTop(SizeText, startY + startH + 4);             return;         }          // 2. 右上         startX = x + w + margin;         startY = y - startH - margin;         if (startX + startW <= canvasWidth && startY >= 0)         {             Canvas.SetLeft(StartCoordText, startX);             Canvas.SetTop(StartCoordText, startY);             Canvas.SetLeft(SizeText, startX);             Canvas.SetTop(SizeText, startY + startH + 4);             return;         }          // 3. 左下         startX = x - startW - margin;         startY = y + h + margin;         if (startX >= 0 && startY + startH + sizeH + 4 <= canvasHeight)         {             Canvas.SetLeft(StartCoordText, startX);             Canvas.SetTop(StartCoordText, startY);             Canvas.SetLeft(SizeText, startX);             Canvas.SetTop(SizeText, startY + startH + 4);             return;         }          // 4. 右下         startX = x + w + margin;         startY = y + h + margin;         if (startX + startW <= canvasWidth && startY + startH + sizeH + 4 <= canvasHeight)         {             Canvas.SetLeft(StartCoordText, startX);             Canvas.SetTop(StartCoordText, startY);             Canvas.SetLeft(SizeText, startX);             Canvas.SetTop(SizeText, startY + startH + 4);             return;         }          // 5. 屏幕内兜底         Canvas.SetLeft(StartCoordText, Math.Max(margin, Math.Min(canvasWidth - startW - margin, x)));         Canvas.SetTop(StartCoordText, Math.Max(margin, Math.Min(canvasHeight - startH - margin, y)));         Canvas.SetLeft(SizeText, Math.Max(margin, Math.Min(canvasWidth - sizeW - margin, x)));         Canvas.SetTop(SizeText, Math.Max(margin, Math.Min(canvasHeight - sizeH - margin, y + startH + 4)));     }      private void OnMouseDown(object sender, MouseButtonEventArgs e)     {         _start = e.GetPosition(this);         _rectShape = new Rectangle         {             Stroke = Brushes.Red,             StrokeThickness = 2,             Fill = new SolidColorBrush(Color.FromArgb(40, 0, 0, 255))         };         CaptureCanvas.Children.Add(_rectShape);         Canvas.SetLeft(_rectShape, _start.Value.X);         Canvas.SetTop(_rectShape, _start.Value.Y);          StartCoordText.Text = $"起点: ({(int)_start.Value.X}, {(int)_start.Value.Y})";         StartCoordText.Visibility = Visibility.Visible;         CurrentCoordText.Text = $"当前: ({(int)_start.Value.X}, {(int)_start.Value.Y})";         CurrentCoordText.Visibility = Visibility.Visible;         SizeText.Text = $"大小: 0 x 0";         SizeText.Visibility = Visibility.Visible;         // 初始位置         PositionTextBlocks(_start.Value.X, _start.Value.Y, 0, 0);     }      private void OnMouseMove(object sender, MouseEventArgs e)     {         if (_start.HasValue && _rectShape != null)         {             var pos = e.GetPosition(this);             double x = Math.Min(_start.Value.X, pos.X);             double y = Math.Min(_start.Value.Y, pos.Y);             double w = Math.Abs(_start.Value.X - pos.X);             double h = Math.Abs(_start.Value.Y - pos.Y);             Canvas.SetLeft(_rectShape, x);             Canvas.SetTop(_rectShape, y);             _rectShape.Width = w;             _rectShape.Height = h;              // 更新当前点坐标             CurrentCoordText.Text = $"当前: ({(int)pos.X}, {(int)pos.Y})";             Canvas.SetLeft(CurrentCoordText, pos.X + 2);             Canvas.SetTop(CurrentCoordText, pos.Y + 2);              // 更新区域大小             SizeText.Text = $"大小: {(int)w} x {(int)h}";             // 动态调整文本位置             PositionTextBlocks(x, y, w, h);         }     }      private void OnMouseUp(object sender, MouseButtonEventArgs e)     {         if (_start.HasValue && _rectShape != null)         {             var end = e.GetPosition(this);             double x = Math.Min(_start.Value.X, end.X);             double y = Math.Min(_start.Value.Y, end.Y);             double w = Math.Abs(_start.Value.X - end.X);             double h = Math.Abs(_start.Value.Y - end.Y);             SelectedRect = new Rect(x, y, w, h);              // 隐藏坐标             StartCoordText.Visibility = Visibility.Collapsed;             CurrentCoordText.Visibility = Visibility.Collapsed;              // 隐藏区域大小             SizeText.Visibility = Visibility.Collapsed;              // 截图             CapturedImage = CaptureScreenArea(SelectedRect);             DialogResult = true;             Close();         }     }      private BitmapSource CaptureScreenArea(Rect rect)     {         double dpiScale = NativeMethods.GetDpiScale(this);          int x = (int)(rect.X * dpiScale);         int y = (int)(rect.Y * dpiScale);         int w = (int)(rect.Width * dpiScale);         int h = (int)(rect.Height * dpiScale);          IntPtr hdcSrc = NativeMethods.GetDC(IntPtr.Zero);         IntPtr hdcDest = NativeMethods.CreateCompatibleDC(hdcSrc);         IntPtr hBitmap = NativeMethods.CreateCompatibleBitmap(hdcSrc, w, h);         IntPtr hOld = NativeMethods.SelectObject(hdcDest, hBitmap);          NativeMethods.BitBlt(hdcDest, 0, 0, w, h, hdcSrc, x, y, 0x00CC0020); // SRCCOPY          NativeMethods.SelectObject(hdcDest, hOld);         NativeMethods.DeleteDC(hdcDest);         NativeMethods.ReleaseDC(IntPtr.Zero, hdcSrc);          try         {             var source = Imaging.CreateBitmapSourceFromHBitmap(                 hBitmap, IntPtr.Zero, Int32Rect.Empty, BitmapSizeOptions.FromEmptyOptions());             source.Freeze();             return source;         }         finally         {             NativeMethods.DeleteObject(hBitmap);         }     }   }

截图api,不走system.draw

NativeMethods.cs

using System.Windows;  namespace MiOcr;  public static class NativeMethods {     [System.Runtime.InteropServices.DllImport("gdi32.dll")]     public static extern bool DeleteObject(IntPtr hObject);      [System.Runtime.InteropServices.DllImport("user32.dll")]     public static extern IntPtr GetDC(IntPtr hWnd);      [System.Runtime.InteropServices.DllImport("user32.dll")]     public static extern int ReleaseDC(IntPtr hWnd, IntPtr hDC);      [System.Runtime.InteropServices.DllImport("gdi32.dll")]     public static extern IntPtr CreateCompatibleDC(IntPtr hdc);      [System.Runtime.InteropServices.DllImport("gdi32.dll")]     public static extern IntPtr CreateCompatibleBitmap(IntPtr hdc, int nWidth, int nHeight);      [System.Runtime.InteropServices.DllImport("gdi32.dll")]     public static extern IntPtr SelectObject(IntPtr hdc, IntPtr hgdiobj);      [System.Runtime.InteropServices.DllImport("gdi32.dll")]     public static extern bool BitBlt(IntPtr hdcDest, int nXDest, int nYDest, int nWidth, int nHeight,         IntPtr hdcSrc, int nXSrc, int nYSrc, int dwRop);      [System.Runtime.InteropServices.DllImport("gdi32.dll")]     public static extern bool DeleteDC(IntPtr hdc);      /// <summary>     /// 获取指定窗口的 DPI 缩放比例。     /// </summary>     /// <param name="window"></param>     /// <returns></returns>     public static double GetDpiScale(Window window)     {         var source = PresentationSource.FromVisual(window);         if (source?.CompositionTarget != null)         {             return source.CompositionTarget.TransformToDevice.M11; // X 方向缩放         }         return 1.0;     } }

最后主界面添加按钮

        <Button Width="140" Height="44"                 Background="#4361EE"                 Foreground="White"                 FontWeight="SemiBold"                 FontSize="16"                  HorizontalAlignment="Left"                 VerticalAlignment="Top"                 Margin="180,20,20,20"                 Click="CaptureScreenButton_Click">             <Button.Effect>                 <DropShadowEffect ShadowDepth="1" BlurRadius="8" Opacity="0.25" Color="#4361EE"/>             </Button.Effect>             <Button.Content>                 <StackPanel Orientation="Horizontal" VerticalAlignment="Center">                     <TextBlock Text="&#xE8A7;"                         FontFamily="Segoe MDL2 Assets"                        FontSize="20"                        Margin="0,0,8,0"                        VerticalAlignment="Center"/>                     <TextBlock Text="截屏"                        FontSize="16"                        VerticalAlignment="Center"/>                 </StackPanel>             </Button.Content>         </Button>

执行OCR初始化检查模型是否存在, 不存在则回调一个正在初始化提示给UI, 免得用户等待不耐烦

可以通过先检查模型文件是否已存在本地，如果不存在则通知UI“正在初始化/下载模型”，再进行下载。你可以这样实现：

检查模型本地路径是否存在。
如果不存在，调用UI回调（如事件、Action、委托等）提示“正在初始化”。
然后异步下载模型。
假设你用 Action? onStatusChanged 作为回调参数，代码如下：

 // 获取模型本地路径  var modelInfo = OnlineFullModels.ChineseV3;  string modelDir = modelInfo.LocalDir;  if (!Directory.Exists(modelDir) || !File.Exists(Path.Combine(modelDir, "model.pdmodel")))  {      onStatusChanged?.Invoke("正在初始化OCR模型，请稍候...");  }

A: modelInfo.LocalDir; 不存在
A: CreateLocalAsync 方法不存在, 改为异步超时回调UI

这个步骤花了一点时间,因为原本的库就不存在这个功能,所以最终走到这样的代码

明白。你可以通过以下方式实现“异步超时回调UI”：
• 在下载模型前，先启动一个定时任务（如 Task.Delay），如果模型下载还未完成且超时，则回调 UI 提示“正在初始化”。
• 如果模型下载提前完成，则取消提示。
下面是推荐实现方式：

public async Task<(List<string> strings, PaddleOcrResult result)> StartOCR(     Mat src,     Action<string>? onStatusChanged = null,     int timeoutMs = 1500) {     PaddleOcrResult result;     var resultText = new List<string>();     var modelInfo = OnlineFullModels.ChineseV3;      // 启动超时提示任务     using var cts = new CancellationTokenSource();     var timeoutTask = Task.Delay(timeoutMs, cts.Token)         .ContinueWith(t =>         {             if (!t.IsCanceled)             {                 onStatusChanged?.Invoke("正在初始化OCR模型，请稍候...");             }         });      // 下载模型（如果已存在会跳过下载）     FullOcrModel model = await modelInfo.DownloadAsync();     cts.Cancel(); // 下载完成，取消超时提示      using (PaddleOcrAll all = new(model)     {         AllowRotateDetection = true,         Enable180Classification = true,     })     {         Stopwatch sw = Stopwatch.StartNew();         result = all.Run(src);         Console.WriteLine($"elapsed={sw.ElapsedMilliseconds} ms");         Console.WriteLine("Detected all texts: n" + result.Text);         foreach (PaddleOcrResultRegion region in result.Regions)         {             Console.WriteLine($"Text: {region.Text}, Score: {region.Score}, RectCenter: {region.Rect.Center}, RectSize: {region.Rect.Size}, Angle: {region.Rect.Angle}");             resultText.Add(region.Text);         }     }     src.Dispose();     return (resultText, result); }

说明：
• 增加了 timeoutMs 参数（默认1.5秒），可根据需要调整。
• 如果模型下载超过指定时间，UI 会收到“正在初始化OCR模型，请稍候...”提示。
• 如果模型提前下载完成，则不会回调提示。
这样可以优雅地实现异步超时回调UI，无需依赖不存在的 CreateLocalAsync 方法。

StartOCR 处理一下 onStatusChanged 回调, 下载初始化模型反馈友好提示

可以将 PaddleOCRService.StartOCR 的 onStatusChanged 回调参数传递到 UI 层，并在模型下载耗时时友好提示用户。
下面是推荐做法：

PaddleOCRService.cs 已实现超时回调（见前述建议）。
MainWindow.xaml.cs 需要在调用 StartOCR 时传递回调，并在 UI 上显示提示。
步骤一：修改 RunOcrAndDraw，传递 onStatusChanged
将 RunOcrAndDraw 方法修改如下，利用 Dispatcher.Invoke 保证 UI 线程安全：

private async Task RunOcrAndDraw(byte[] imageBytes) {     ProcessingText.Visibility = Visibility.Visible;     ProcessingText.Text = "正在识别...";      var ocr = new PaddleOCRService();     var results = await ocr.StartOCR(imageBytes, msg =>     {         // 友好提示，确保在UI线程         Dispatcher.Invoke(() =>         {             ProcessingText.Text = msg;             ProcessingText.Visibility = Visibility.Visible;         });     });      ProcessingText.Visibility = Visibility.Collapsed;      if (results.strings == null || results.strings.Count == 0)     {         OcrTextBox.Text = "未识别到文本";         return;     }      OcrTextBox.Text = string.Join(Environment.NewLine, results.result.Regions.Select(r => r.Text));     _lastOcrResult = results.result;     var drawed = DrawOcrResultsOnImage(_currentImage, results.result);     MainImage.Source = drawed; }

步骤二：确保 PaddleOCRService.cs 的 StartOCR 支持 onStatusChanged
如前所述，PaddleOCRService.cs 需如下实现：

public async Task<(List<string> strings, PaddleOcrResult result)> StartOCR(     Mat src,     Action<string>? onStatusChanged = null,     int timeoutMs = 1500) {     PaddleOcrResult result;     var resultText = new List<string>();     var modelInfo = OnlineFullModels.ChineseV3;      using var cts = new CancellationTokenSource();     var timeoutTask = Task.Delay(timeoutMs, cts.Token)         .ContinueWith(t =>         {             if (!t.IsCanceled)             {                 onStatusChanged?.Invoke("正在初始化OCR模型，请稍候...");             }         });      FullOcrModel model = await modelInfo.DownloadAsync();     cts.Cancel();      using (PaddleOcrAll all = new(model)     {         AllowRotateDetection = true,         Enable180Classification = true,     })     {         Stopwatch sw = Stopwatch.StartNew();         result = all.Run(src);         foreach (PaddleOcrResultRegion region in result.Regions)         {             resultText.Add(region.Text);         }     }     src.Dispose();     return (resultText, result); }

这样，用户在模型首次下载或初始化时会看到“正在初始化OCR模型，请稍候...”，其余时间显示“正在识别...”，体验更友好。

回顾

项目目标

本项目旨在利用 Github Copilot 辅助开发，从零实现一个基于 AI 的 OCR（光学字符识别）工具。项目采用 .NET 9 和 WPF 技术栈，集成了 PaddleOCR 作为核心识别引擎，实现了图片文字识别、区域选择、结果高亮与复制等实用功能。

主要技术与依赖

• 开发语言与平台：C#，.NET 9，WPF
• OCR引擎：Sdcb.OpenVINO.PaddleOCR
• 图像处理：OpenCvSharp
• 界面交互：WPF，支持拖拽、粘贴、截图等多种图片输入方式
• AI辅助开发：Github Copilot 提供代码建议与自动补全

核心功能

图片输入
• 支持文件选择、拖拽、粘贴、屏幕截图等多种方式加载图片。
OCR识别
• 调用 PaddleOCR 进行文字识别，支持中文、英文等多语种。
• 识别结果实时显示，支持区域高亮和文字复制。
用户体验优化
• 首次模型下载时，异步回调 UI，友好提示“正在初始化OCR模型，请稍候...”，避免用户等待时无响应。
• 识别过程有进度提示，提升交互体验。
结果交互
• 支持鼠标框选图片区域，提取并复制选中区域的文字。
• 右键点击可复制单个识别文本。