Desktop Control Skill

High-performance desktop automation for OpenClaw. Optimized for minimal latency, efficient resource usage, and robust error handling.

🚀 Quick Start

from desktop_control import DesktopController

# Context manager ensures cleanup
with DesktopController() as dc:
    dc.move_mouse(500, 300)
    dc.click()
    dc.type_text("Hello World!")

📦 Installation

pip install -r requirements.txt

✨ Features

Mouse Control

Instant movements (0ms overhead)
Smooth animations with easing curves
Relative positioning from current location
Click variations: left, right, middle, double
Drag & drop with configurable duration
Scroll: vertical and horizontal

Keyboard Control

Type text with WPM or interval control
Hotkeys (Ctrl+C, Alt+Tab, etc.)
Key combinations via context manager
Individual key press/hold/release

Screen Operations

Fast screenshots to file or PIL Image
Pixel color reading
Image finding on screen (OpenCV)
Region capture for performance
Wait for image with timeout

Window Management

List all windows
Activate by title (partial match)
Get active window

Safety Features

Corner failsafe - move mouse to any corner to abort
Bounds checking - prevents off-screen coordinates
Retry logic - auto-retry flaky operations
Approval mode - ask before each action

🔧 API Reference

DesktopController

DesktopController(
    failsafe: bool = True,          # Abort on corner detection
    require_approval: bool = False,  # Ask before actions
    log_level: int = logging.INFO   # Verbosity
)

Context Manager Support:

with DesktopController() as dc:
    # Automatic cleanup on exit
    dc.type_text("Hello")

Mouse Operations

`move_mouse(x, y, duration=0, smooth=True)`

Move to absolute coordinates instantly or with animation.

dc.move_mouse(1000, 500)                    # Instant
dc.move_mouse(1000, 500, duration=0.5)      # Smooth 0.5s
dc.move_mouse(1000, 500, smooth=False)      # Linear (no easing); set duration > 0 to animate

`move_relative(dx, dy, duration=0)`

Move from current position.

dc.move_relative(100, -50)   # Right 100, Up 50

`click(x=None, y=None, button='left', clicks=1, interval=0.05)`

Click with retry logic.

dc.click()                          # Current position
dc.click(500, 300)                  # Specific position
dc.click(button='right')            # Right click
dc.double_click(500, 300)           # Double click (convenience)

`drag(x1, y1, x2, y2, duration=0.3, button='left')`

Drag from point to point.

# Drag file from A to B
dc.drag(200, 200, 800, 500, duration=0.5)

`scroll(amount, x=None, y=None, horizontal=False)`

Scroll wheel.

dc.scroll(-5)              # Down 5 clicks
dc.scroll(10, 500, 300)    # Scroll at position
dc.scroll(3, horizontal=True)  # Horizontal

Keyboard Operations

`type_text(text, interval=None, wpm=None)`

Type text with speed control.

dc.type_text("Instant!", interval=0)      # No delay
dc.type_text("Human-like", wpm=80)        # 80 words per minute
dc.type_text("Custom", interval=0.1)     # 100ms between keys

`hotkey(*keys, interval=0.01)`

Execute keyboard shortcuts.

dc.hotkey('ctrl', 'c')      # Copy
dc.hotkey('alt', 'tab')     # Switch window
dc.hotkey('ctrl', 'shift', 'esc')  # Task manager

`press(key, presses=1, interval=0.05)`

Press individual keys.

dc.press('enter')
dc.press('space', presses=3)
dc.press('f5')              # Refresh

Key Hold Context Manager

Hold keys during a block:

with dc.hold_keys('ctrl', 'shift'):
    dc.press('end')         # Ctrl+Shift+End
    dc.click(500, 300)      # Ctrl+Shift+Click

Screen Operations

`screenshot(region=None, filename=None)`

Capture screen with retry logic.

# Full screen to PIL Image
img = dc.screenshot()

# Save to file (returns None)
dc.screenshot(filename="screen.png")

# Region only (left, top, width, height)
dc.screenshot(region=(100, 100, 800, 600), filename="region.png")

`screenshot_to_file(filename, region=None)`

Convenience method that saves the screenshot to a file and returns the filename.

path = dc.screenshot_to_file(f"capture_{time.time()}.png")

`get_pixel(x, y)`

Get RGB color at coordinates.

r, g, b = dc.get_pixel(500, 300)

`find_on_screen(image_path, confidence=0.9, grayscale=True)`

Find image on screen.

# Returns (x, y, width, height) or None
location = dc.find_on_screen("button.png", confidence=0.9)
if location:
    x, y, w, h = location
    dc.click(x + w//2, y + h//2)  # Click center

`find_all_on_screen(image_path, confidence=0.9)`

Find all occurrences.

matches = dc.find_all_on_screen("icon.png")
for x, y, w, h in matches:
    print(f"Found at ({x}, {y})")

`wait_for_image(image_path, timeout=10.0, interval=0.5)`

Wait for image to appear.

location = dc.wait_for_image("loading_complete.png", timeout=30)
if location:
    print("Ready!")

Window Operations

`get_all_windows()`

List all window titles.

windows = dc.get_all_windows()
for title in windows:
    if "Chrome" in title:
        print(f"Found: {title}")

`activate_window(title, partial=True)`

Bring window to front.

dc.activate_window("Chrome")        # Partial match
dc.activate_window("Untitled - Notepad", partial=False)  # Exact

`get_active_window()`

Get current window title.

current = dc.get_active_window()

`find_window(title_substring)`

Find window by partial title.

title = dc.find_window("Document")
if title:
    dc.activate_window(title)

Clipboard

# Copy to clipboard
dc.copy_to_clipboard("Hello")

# Get from clipboard
text = dc.get_clipboard()

Utility

# Sleep (same as time.sleep)
dc.sleep(0.5)

# Wait for key press
dc.wait_for_key('space', timeout=10.0)

# Show alert
dc.alert("Operation complete!")

# Confirm dialog
if dc.confirm("Proceed?"):
    print("User said yes")

🎯 Performance Tips

Use context managers for automatic cleanup:

with DesktopController() as dc:
    # operations

Minimize screenshots in loops:

# Bad - screenshot every iteration
for _ in range(100):
    dc.screenshot()

# Better - capture once and reuse the image, or poll with wait_for_image(..., interval=...)

Use instant movements when smoothness isn't needed:
```
dc.move_mouse(x, y, duration=0)  # vs 0.5s
```

Batch keyboard operations with hotkeys:

# One hotkey call
dc.hotkey('ctrl', 'a')

# Instead of multiple key presses

Use grayscale for image matching (2x faster):

dc.find_on_screen("image.png", grayscale=True)

🛡️ Safety

Failsafe Mode

# Move mouse to any corner to abort all automation
dc = DesktopController(failsafe=True)

Approval Mode

# Ask before each action
dc = DesktopController(require_approval=True)
dc.click(500, 300)  # Prompts: "Allow left click at (500, 300)? [y/n]"

🔌 AI Agent (Optional)

For cognitive automation with task planning:

from ai_agent import create_agent

with create_agent() as agent:
    result = agent.execute_task("Type Hello World in Notepad")
    print(f"Status: {result.status}")

See ai_agent.py for full capabilities.

📋 Quick Reference

from desktop_control import (
    DesktopController, get_controller,
    move, click, typewrite, hotkey, screenshot
)

# Initialize
dc = DesktopController()

# Mouse
dc.move_mouse(x, y)
dc.click(x, y)
dc.drag(x1, y1, x2, y2)
dc.scroll(amount)
pos = dc.get_mouse_position()

# Keyboard
dc.type_text(text, wpm=80)
dc.hotkey('ctrl', 'c')
dc.press('enter')

with dc.hold_keys('ctrl'):
    dc.click(x, y)

# Screen
img = dc.screenshot()
dc.screenshot_to_file("out.png")
color = dc.get_pixel(x, y)
loc = dc.find_on_screen("image.png")

# Window
windows = dc.get_all_windows()
dc.activate_window("Chrome")

Built for OpenClaw - Optimized for speed and reliability 🦞

lazy

Safety Notice

Copy this and send it to your AI assistant to learn

Desktop Control Skill

🚀 Quick Start

📦 Installation

✨ Features

Mouse Control

Keyboard Control

Screen Operations

Window Management

Safety Features

🔧 API Reference

DesktopController

Mouse Operations

move_mouse(x, y, duration=0, smooth=True)

move_relative(dx, dy, duration=0)

click(x=None, y=None, button='left', clicks=1, interval=0.05)

drag(x1, y1, x2, y2, duration=0.3, button='left')

scroll(amount, x=None, y=None, horizontal=False)

Keyboard Operations

type_text(text, interval=None, wpm=None)

hotkey(*keys, interval=0.01)

press(key, presses=1, interval=0.05)

Key Hold Context Manager

Screen Operations

screenshot(region=None, filename=None)

screenshot_to_file(filename, region=None)

get_pixel(x, y)

find_on_screen(image_path, confidence=0.9, grayscale=True)

find_all_on_screen(image_path, confidence=0.9)

wait_for_image(image_path, timeout=10.0, interval=0.5)

Window Operations

get_all_windows()

activate_window(title, partial=True)

get_active_window()

find_window(title_substring)

Clipboard

Utility

🎯 Performance Tips

🛡️ Safety

Failsafe Mode

Approval Mode

🔌 AI Agent (Optional)

📋 Quick Reference

Source Transparency

Related Skills

Harbor Skills

Dynamics Crm

Jira

Generate Education Ad Creative Brief

`move_mouse(x, y, duration=0, smooth=True)`

`move_relative(dx, dy, duration=0)`

`click(x=None, y=None, button='left', clicks=1, interval=0.05)`

`drag(x1, y1, x2, y2, duration=0.3, button='left')`

`scroll(amount, x=None, y=None, horizontal=False)`

`type_text(text, interval=None, wpm=None)`

`hotkey(*keys, interval=0.01)`

`press(key, presses=1, interval=0.05)`

`screenshot(region=None, filename=None)`

`screenshot_to_file(filename, region=None)`

`get_pixel(x, y)`

`find_on_screen(image_path, confidence=0.9, grayscale=True)`

`find_all_on_screen(image_path, confidence=0.9)`

`wait_for_image(image_path, timeout=10.0, interval=0.5)`

`get_all_windows()`

`activate_window(title, partial=True)`

`get_active_window()`

`find_window(title_substring)`