#!/usr/bin/env python3
# cs/devtools/move_cc.py
"""
move_cc.py
A repo-aware, AST-assisted refactoring tool to move files/directories within a C/C++
repository while updating:
- header guards in moved headers
- namespace declarations in moved files
- includes that reference moved headers
- fully-qualified namespace usages (textual, but skipping comments/strings)
- textual path references in BUILD files
Design notes / constraints
- Primary implementation is Python 3.8+ and uses only standard library features by
default. Optionally, if clang.cindex (libclang Python bindings) is available,
the script will use it to confirm/locate namespace declarations more reliably.
- This tool is conservative: it prints a detailed plan of changes before applying
them, and requires `--apply` to actually mutate files. When `--apply` is used,
a clean git working tree is required by default (can be disabled).
- The script attempts to use `git mv` for moves when possible to preserve history.
Limitations / caveats
- A fully correct C++ refactorer is a large effort; this tool aims to be practical
for usual code-bases where directory structure maps to namespaces and include
paths are stable. It may require manual follow-up for complex edge cases.
Usage examples
# dry-run plan
python3 move_cc.py --src cs/foo/bar --dst cs/foo/qux
# actually apply changes (requires clean git tree by default)
python3 move_cc.py --src cs/foo/bar --dst cs/foo/qux --apply --base_dir=/path/to/repo
# move a single file
python3 move_cc.py --src cs/foo/bar/baz.h --dst cs/foo/qux/ --apply
Author: auto-generated; adapt to your repo's conventions as needed.
"""
from __future__ import annotations
import argparse
import os
import re
import shutil
import stat
import subprocess
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Optional, Sequence, Tuple
# Optional clang import (best-effort). If present, libclang will be used to
# sanity-check namespace declarations and to locate declaration locations.
try:
import clang.cindex as clang
except Exception: # pragma: no cover - optional
clang = None
# ----------------------------- Utilities ---------------------------------- #
def eprint(*args, **kwargs):
    """Print to standard error.

    Accepts the same positional and keyword arguments as ``print`` except
    ``file``, which is always ``sys.stderr``.
    """
    stream = sys.stderr
    print(*args, file=stream, **kwargs)
def default_header_exts() -> Tuple[str, ...]:
    """Return the file suffixes (with leading dot) treated as C/C++ headers."""
    header_suffixes: Tuple[str, ...] = (".h", ".hh", ".hpp", ".hxx", ".inc")
    return header_suffixes
def default_source_exts() -> Tuple[str, ...]:
    """Return the file suffixes (with leading dot) treated as C/C++/ObjC sources."""
    source_suffixes: Tuple[str, ...] = (".c", ".cc", ".cpp", ".cxx", ".m", ".mm")
    return source_suffixes
def is_text_file(path: Path) -> bool:
    """Heuristically decide whether *path* holds text.

    Only the first 2048 bytes are inspected; the file counts as text when
    that prefix contains no NUL byte. Any failure to open or read the file
    yields ``False``.
    """
    try:
        with path.open("rb") as handle:
            head = handle.read(2048)
    except Exception:
        # Best-effort probe: unreadable files are simply "not text".
        return False
    return b"\0" not in head
# ------------------------ Comment/string masking -------------------------- #
def compute_comment_string_mask(text: str) -> List[bool]:
    """Flag every character of *text* that lies inside a comment or literal.

    Returns a list of booleans with ``len(result) == len(text)``; ``True``
    means the character belongs to a C/C++ comment or to a string/character
    literal (delimiters included).

    This is a deliberately small lexer that understands:
    - ``//`` line comments (the terminating newline is NOT masked)
    - ``/* ... */`` block comments (an unterminated one runs to end of text)
    - ``"..."`` and ``'...'`` literals with backslash escapes (an
      unterminated one runs to end of text)

    Known limitations: raw string literals, C++14 digit separators
    (``1'000``) and backslash-continued line comments are not recognised.
    """
    n = len(text)
    mask = [False] * n
    i = 0
    while i < n:
        ch = text[i]
        pair = text[i : i + 2]
        if pair == "//":
            end = text.find("\n", i)
            if end == -1:
                end = n
        elif pair == "/*":
            # Look for the terminator only AFTER the two-character opener;
            # otherwise "/*/" would be mistaken for a complete comment.
            close = text.find("*/", i + 2)
            end = n if close == -1 else close + 2
        elif ch in ('"', "'"):
            end = i + 1
            while end < n:
                cur = text[end]
                if cur == "\\":
                    # Skip the escaped character together with the backslash.
                    end += 2
                    continue
                end += 1
                if cur == ch:
                    break
            # A trailing lone backslash can push `end` one past the text.
            end = min(end, n)
        else:
            i += 1
            continue
        mask[i:end] = [True] * (end - i)
        i = end
    return mask
# --------------------------- Header guard helpers ------------------------- #
_guard_token_re = re.compile(r"[^A-Za-z0-9]+")


def path_to_guard(rel: Path) -> str:
    """Derive an include-guard macro name from a repo-relative path.

    Every run of non-alphanumeric characters in the POSIX form of *rel*
    becomes a single underscore, leading/trailing underscores are removed
    and the result is upper-cased. A result that would start with a digit
    is prefixed with ``_`` so it remains a valid identifier.
    """
    collapsed = re.sub(r"_+", "_", _guard_token_re.sub("_", rel.as_posix()))
    guard = collapsed.strip("_").upper()
    return "_" + guard if guard[:1].isdigit() else guard
_RE_IFNDEF = re.compile(r"^\s*#\s*ifndef\s+([A-Za-z_][A-Za-z0-9_]*)\s*$", re.MULTILINE)
_RE_IF_NOT_DEFINED = re.compile(
    r"^\s*#\s*if\s*!\s*defined\s*\(?\s*([A-Za-z_][A-Za-z0-9_]*)\s*\)?\s*$", re.MULTILINE
)


def detect_guard(text: str) -> Optional[str]:
    """Return the include-guard macro used by *text*, or ``None``.

    The first ``#ifndef NAME`` is tried, then the first
    ``#if !defined(NAME)``. A candidate is accepted only when a
    ``#define NAME`` appears somewhere after it. Files without such a pair
    (for example ones relying on ``#pragma once``) yield ``None``.
    """
    for opener in (_RE_IFNDEF, _RE_IF_NOT_DEFINED):
        hit = opener.search(text)
        if hit is None:
            continue
        candidate = hit.group(1)
        define_pattern = r"^\s*#\s*define\s+" + re.escape(candidate) + r"\b"
        if re.search(define_pattern, text[hit.end() :], re.MULTILINE):
            return candidate
    return None
def rewrite_guard(text: str, old: str, new: str) -> str:
    """Rename the include guard *old* to *new* throughout *text*.

    Rewrites, in this order, every ``#ifndef OLD``, ``#if !defined(OLD)``
    and ``#define OLD`` directive, plus any mention of OLD on an ``#endif``
    line (typically the trailing ``// OLD`` comment).

    *new* is inserted literally via replacement callables rather than a
    regex replacement template, so a name beginning with a digit (which
    would turn ``\\1`` into a bogus group reference) or containing a
    backslash cannot break or alter the substitution.
    """
    name = re.escape(old)

    def _swap_between(match):
        # Keep the directive prefix and whatever followed the old name.
        return match.group(1) + new + match.group(2)

    directive_patterns = (
        rf"^(\s*#\s*ifndef\s+){name}(\b)",
        rf"^(\s*#\s*if\s*!\s*defined\s*\(?\s*){name}(\s*\)?\s*$)",
        rf"^(\s*#\s*define\s+){name}(\b)",
    )
    for pattern in directive_patterns:
        text = re.sub(pattern, _swap_between, text, flags=re.MULTILINE)

    # The #endif pattern has a single group: everything before the old name.
    text = re.sub(
        rf"^(\s*#\s*endif\b[^\n]*?)\b{name}\b",
        lambda match: match.group(1) + new,
        text,
        flags=re.MULTILINE,
    )
    return text
# -------------------------- Namespace helpers ---------------------------- #
def path_to_namespace_components(rel: Path, ns_root: Optional[Path]) -> List[str]:
    """Compute C++ namespace components from a repo-relative path.

    Args:
        rel: Repo-relative directory, or a file inside it. A path whose last
            component has a suffix (``foo/bar/baz.h``) is treated as a file
            even when it does not exist on disk, which is the case for
            every move destination at planning time. The filesystem is only
            consulted as a fallback for suffix-less files. NOTE: a directory
            whose name contains a dot (``v1.2``) is therefore mistaken for a
            file; pass its children instead.
        ns_root: Optional prefix to strip from the path before conversion.
            May be a ``Path``, a string, or a sequence of path parts. It is
            ignored when it is not a prefix of the path.

    Returns:
        One sanitized identifier per remaining directory component:
        characters outside ``[A-Za-z0-9_]`` become ``_`` and a leading digit
        is prefixed with ``_``.
    """
    p = Path(rel)
    if p.suffix or p.is_file():
        p = p.parent
    parts = list(p.parts)

    if ns_root:
        if isinstance(ns_root, (list, tuple)):
            root_parts = list(ns_root)
        else:
            root_parts = list(Path(ns_root).parts)
        # Strip the prefix only when it really is a prefix of the path.
        if len(root_parts) <= len(parts) and parts[: len(root_parts)] == root_parts:
            parts = parts[len(root_parts) :]

    def _sanitize(name: str) -> str:
        name = re.sub(r"[^A-Za-z0-9_]", "_", name)
        if re.match(r"^[0-9]", name):
            name = "_" + name
        return name

    return [_sanitize(part) for part in parts if part]
def find_leading_namespace_block(text: str) -> Optional[Tuple[List[str], int, int]]:
    """Locate the first namespace opening in *text*.

    Returns ``(components, start, end)`` or ``None`` when no line begins
    (after optional whitespace) with ``namespace NAME {``.

    - For the C++17 form ``namespace a::b::c {`` the components are the
      ``::``-separated names and ``start``/``end`` delimit that single match
      (including any whitespace the pattern consumed before it).
    - Otherwise a run of consecutive ``namespace NAME {`` openers is
      matched, optionally preceded by comments, and ``start``/``end``
      delimit the whole run including surrounding whitespace it consumed.

    Closing braces are not located or validated.
    """
    first = re.compile(r"^\s*namespace\s+([A-Za-z0-9_:]+)\s*{", re.MULTILINE).search(text)
    if first is None:
        return None

    declared = first.group(1)
    if "::" in declared:
        # Nested-namespace-definition syntax: a single opener carries all names.
        return [piece for piece in declared.split("::") if piece], first.start(), first.end()

    # Classic style: greedily collect back-to-back 'namespace NAME {' openers.
    nested_run = re.compile(
        r"^(?:\s*(?:/(?:/).*?\n|/\*(?:.|\n)*?\*/\s*)*)((?:\s*namespace\s+[A-Za-z_][A-Za-z0-9_]*\s*\{\s*)+)",
        re.MULTILINE,
    ).search(text)
    if nested_run is None:
        # The run pattern is stricter about identifiers; fall back to the
        # single opener found above.
        return [declared], first.start(), first.end()

    names = re.findall(r"namespace\s+([A-Za-z_][A-Za-z0-9_]*)\s*\{", nested_run.group(1))
    return names, nested_run.start(1), nested_run.end(1)
# ----------------------------- File discovery ---------------------------- #
def iter_repo_files(
    root: Path, exts: Sequence[str], exclude_dirs: Sequence[str]
) -> Iterable[Path]:
    """Yield regular files under *root* whose suffix is in *exts*.

    Args:
        root: Directory to walk recursively.
        exts: File suffixes (with leading dot); compared case-insensitively.
        exclude_dirs: Directory-name patterns to skip. Each pattern is
            matched case-insensitively, with ``fnmatch`` wildcards, against
            every DIRECTORY component of the path relative to *root*
            (``"build"`` matches a directory named exactly ``build``,
            ``"bazel-*"`` matches by prefix). The file name itself is never
            tested, so ``src/builder.cc`` is not swallowed by a ``build``
            exclusion. Empty patterns are ignored.

    Symlinked files are skipped.
    """
    from fnmatch import fnmatchcase

    exts_l = tuple(e.lower() for e in exts)
    patterns = [pat for pat in (ed.strip("/").lower() for ed in exclude_dirs) if pat]
    for p in root.rglob("*"):
        if p.is_symlink():
            continue
        if not (p.is_file() and p.suffix.lower() in exts_l):
            continue
        # Only the directories leading to the file take part in exclusion.
        dir_parts = [s.lower() for s in p.relative_to(root).parts[:-1]]
        if any(fnmatchcase(part, pat) for part in dir_parts for pat in patterns):
            continue
        yield p
# ----------------------------- Git helpers ------------------------------- #
def git_is_clean(root: Path) -> bool:
    """Report whether the git working tree at *root* has no pending changes.

    Runs ``git status --porcelain`` in *root* and returns ``True`` only when
    it succeeds with empty output. Any failure (git missing, not a
    repository, non-zero exit) is reported as ``False``.
    """
    command = ["git", "-C", str(root), "status", "--porcelain"]
    try:
        completed = subprocess.run(command, capture_output=True, text=True, check=True)
    except Exception:
        return False
    return not completed.stdout.strip()
def try_git_mv(src: Path, dst: Path, base_dir: Path) -> bool:
    """Attempt ``git mv src dst`` with both paths expressed relative to *base_dir*.

    The command runs with *base_dir* as working directory. Returns ``True``
    when git exits successfully and ``False`` on any failure, including
    either path lying outside *base_dir*.
    """
    try:
        command = [
            "git",
            "mv",
            str(src.relative_to(base_dir)),
            str(dst.relative_to(base_dir)),
        ]
        subprocess.run(command, cwd=base_dir, check=True)
    except Exception:
        return False
    return True
# ----------------------------- Core functions ---------------------------- #
@dataclass
class PlannedChange:
    """A pending whole-file text rewrite, optionally paired with a relocation."""

    # File the rewrite was planned against (where orig_text was read from).
    path: Path
    # Content of the file at planning time.
    orig_text: str
    # Complete replacement content for the file.
    new_text: str
    # Location the file should end up at; apply_planned_changes treats a
    # value different from `path` as a request to move the file.
    new_path: Path
def plan_moves(root: Path, src: Path, dst: Path) -> Dict[Path, Path]:
    """Map every file to be moved from its old to its new absolute path.

    *src* and *dst* are joined onto *root* and resolved. For a single source
    file the target is ``dst/<name>`` when *dst* is an existing directory,
    otherwise *dst* itself is taken as the new file name. For a source
    directory every file below it keeps its relative position under *dst*.

    Raises:
        SystemExit: if the source does not exist.
    """
    src_abs = (root / src).resolve()
    dst_abs = (root / dst).resolve()
    if not src_abs.exists():
        raise SystemExit(f"Source does not exist: {src}")

    if src_abs.is_file():
        # is_dir() is False for a missing path, so no separate exists() check.
        target = dst_abs / src_abs.name if dst_abs.is_dir() else dst_abs
        return {src_abs: target}

    return {
        entry: dst_abs / entry.relative_to(src_abs)
        for entry in src_abs.rglob("*")
        if entry.is_file()
    }
def plan_header_guard_rewrites(
    root: Path, moves: Dict[Path, Path]
) -> List[PlannedChange]:
    """Plan include-guard renames for the headers among *moves*.

    For each moved file with a header suffix, the current guard is detected
    and compared with the guard derived from the file's new repo-relative
    path; a change is planned only when a guard exists and differs.

    Raises:
        ValueError: if a header's old or new path is not under *root*.
    """
    planned: List[PlannedChange] = []
    for old, new in moves.items():
        if old.suffix.lower() not in default_header_exts():
            continue
        current_text = old.read_text(encoding="utf-8", errors="replace")
        # Result unused; kept because it rejects sources outside the repo.
        old.relative_to(root)
        wanted_guard = path_to_guard(new.relative_to(root))
        current_guard = detect_guard(current_text)
        if not current_guard or current_guard == wanted_guard:
            continue
        rewritten = rewrite_guard(current_text, current_guard, wanted_guard)
        planned.append(PlannedChange(old, current_text, rewritten, new_path=new))
    return planned
def plan_namespace_rewrites_for_moved_files(
    root: Path, moves: Dict[Path, Path], ns_root: Optional[Path]
) -> List[PlannedChange]:
    """Plan rewrites of the leading namespace opener in moved C/C++ files.

    A file is rewritten only when its leading namespace starts with the
    namespace derived from its OLD directory; that prefix is swapped for
    the namespace derived from its NEW directory, and any further inner
    namespaces declared by the file are kept.

    Conservative by design:
    - Only files with a known header/source suffix are inspected.
    - Namespaces are derived from the containing DIRECTORY, so the file name
      never leaks into the namespace, whether or not the path exists yet.
    - ``namespace a::b {`` openers are rewritten in place.
    - Nested ``namespace a { namespace b {`` openers are rewritten name by
      name, preserving the original layout, but only when the nesting depth
      is unchanged. A different depth would need matching edits to the
      closing braces, so the file is skipped with a warning instead of
      producing unbalanced braces.
    - Closing-brace comments such as ``}  // namespace old`` are not updated.

    Raises:
        ValueError: if an inspected file's old or new path is not under *root*.
    """
    changes: List[PlannedChange] = []
    for old, new in moves.items():
        if old.suffix.lower() not in default_header_exts() + default_source_exts():
            continue
        text = old.read_text(encoding="utf-8", errors="replace")
        found = find_leading_namespace_block(text)
        if not found:
            continue
        comps, sidx, eidx = found

        old_ns = path_to_namespace_components(old.relative_to(root).parent, ns_root)
        new_ns = path_to_namespace_components(new.relative_to(root).parent, ns_root)
        if not old_ns or not new_ns or old_ns == new_ns:
            continue
        if comps[: len(old_ns)] != old_ns:
            continue

        # Keep namespaces the file declares beyond the path-derived prefix.
        target = new_ns + comps[len(old_ns) :]
        header_fragment = text[sidx:eidx]

        if "::" in header_fragment:
            joined = "::".join(target)
            new_fragment = re.sub(
                r"namespace\s+[A-Za-z0-9_:]+\s*{",
                lambda _m: f"namespace {joined} {{",
                header_fragment,
            )
        elif len(target) == len(comps):
            remaining = iter(target)

            def _rename(match, _names=remaining):
                # Swap names in order; leave surplus openers untouched.
                name = next(_names, None)
                if name is None:
                    return match.group(0)
                return match.group(1) + name + match.group(2)

            new_fragment = re.sub(
                r"(namespace\s+)[A-Za-z_][A-Za-z0-9_]*(\s*\{)",
                _rename,
                header_fragment,
            )
        else:
            eprint(
                f"[WARN] {old}: namespace depth changes "
                f"({'::'.join(comps)} -> {'::'.join(target)}); "
                "nested-brace namespaces must be updated manually."
            )
            continue

        new_text = text[:sidx] + new_fragment + text[eidx:]
        if new_text != text:
            changes.append(PlannedChange(old, text, new_text, new_path=new))
    return changes
def plan_include_and_path_rewrites(
    root: Path, moves: Dict[Path, Path], additional_exts: Sequence[str]
) -> List[PlannedChange]:
    """Plan repo-wide updates of references to the moved files.

    Every header/source file under *root* (plus *additional_exts*) outside
    ``.git``, ``third_party`` and ``build`` is scanned for:
    - ``#include "old/path"`` / ``#include <old/path>`` directives;
    - plain occurrences of old repo-relative paths, in files named
      ``BUILD`` or ending in ``.bzl`` / ``.bazel`` only;
    - qualified usages ``old::ns::`` of the namespace derived from a moved
      file's old directory, skipping comments and string literals.

    Returned changes never request a move (``new_path`` equals ``path``);
    relocating files is the caller's job.

    Args:
        root: Repository root; moved paths outside it are ignored.
        moves: Old absolute path -> new absolute path of each moved file.
        additional_exts: Extra suffixes to scan besides the C/C++ defaults.
    """
    changes: List[PlannedChange] = []

    # Repo-relative POSIX form of each move, as written in includes/BUILD files.
    rel_map: Dict[str, str] = {}
    for old, new in moves.items():
        try:
            rel_old = old.relative_to(root).as_posix()
            rel_new = new.relative_to(root).as_posix()
        except ValueError:
            continue
        rel_map[rel_old] = rel_new
    if not rel_map:
        return changes

    # Compile once; the same rules are applied to every scanned file.
    include_rules = [
        (re.compile(r'(#\s*include\s*["<])' + re.escape(rel_old) + r'([">])'), rel_new)
        for rel_old, rel_new in rel_map.items()
    ]

    # Namespaces follow directories, so derive them from each file's parent
    # and de-duplicate: all files of one directory share a single mapping.
    ns_pairs: Dict[str, str] = {}
    for rel_old, rel_new in rel_map.items():
        old_ns = "::".join(path_to_namespace_components(Path(rel_old).parent, None))
        new_ns = "::".join(path_to_namespace_components(Path(rel_new).parent, None))
        if old_ns and old_ns != new_ns:
            ns_pairs.setdefault(old_ns, new_ns)
    # Longest first, so a sub-namespace is matched before its parent prefix.
    ns_rules = [
        (re.compile(r"(?<![A-Za-z0-9_])" + re.escape(old_ns) + r"(?=::)"), new_ns)
        for old_ns, new_ns in sorted(ns_pairs.items(), key=lambda kv: -len(kv[0]))
    ]

    exts = list(default_header_exts() + default_source_exts())
    exts.extend(additional_exts)

    for f in iter_repo_files(root, exts, exclude_dirs=[".git", "third_party", "build"]):
        orig_text = f.read_text(encoding="utf-8", errors="replace")
        text = orig_text

        # Include directives. The new path is inserted literally, not as a
        # regex replacement template.
        for pattern, rel_new in include_rules:
            text = pattern.sub(
                lambda m, _new=rel_new: m.group(1) + _new + m.group(2), text
            )

        # Plain path references in Bazel files.
        if f.name == "BUILD" or f.suffix in (".bzl", ".bazel"):
            for rel_old, rel_new in rel_map.items():
                text = text.replace(rel_old, rel_new)

        # Qualified namespace usages. The mask is computed from the CURRENT
        # text (earlier rewrites shift offsets) and is valid for the whole
        # single-pass substitution because match offsets refer to that text.
        for pattern, new_ns in ns_rules:
            if not pattern.search(text):
                continue
            mask = compute_comment_string_mask(text)
            text = pattern.sub(
                lambda m, _ns=new_ns, _mask=mask: m.group(0) if _mask[m.start()] else _ns,
                text,
            )

        if text != orig_text:
            changes.append(PlannedChange(f, orig_text, text, new_path=f))
    return changes
# ------------------------------ Apply changes ---------------------------- #
def find_repo_root(path: Path) -> Path | None:
    """Return the top-level directory of the git repository containing *path*.

    ``git rev-parse --show-toplevel`` is run in *path*'s parent directory.
    Returns ``None`` when that directory is not inside a repository, when
    git exits with an error, when the ``git`` executable cannot be started
    at all, or when git prints nothing.
    """
    try:
        result = subprocess.run(
            ["git", "-C", str(path.parent), "rev-parse", "--show-toplevel"],
            capture_output=True,
            text=True,
            check=True,
        )
    except (subprocess.CalledProcessError, OSError):
        # OSError covers a missing or non-executable git binary.
        return None
    top_level = result.stdout.strip()
    return Path(top_level) if top_level else None
def _merge_rewrites(base: str, variants: Sequence[str]) -> Tuple[str, int]:
    """Fold several independent rewrites of *base* into one text.

    Each variant is diffed line-wise against *base* and the resulting edits
    are replayed together. An edit overlapping one already applied is
    dropped. Returns ``(merged_text, conflicts)`` where ``conflicts`` counts
    dropped edits that were not identical to the edit that won.
    """
    import difflib

    base_lines = base.splitlines(keepends=True)
    edits: List[Tuple[int, int, List[str]]] = []
    for variant in variants:
        variant_lines = variant.splitlines(keepends=True)
        matcher = difflib.SequenceMatcher(None, base_lines, variant_lines, autojunk=False)
        for tag, i1, i2, j1, j2 in matcher.get_opcodes():
            if tag != "equal":
                edits.append((i1, i2, variant_lines[j1:j2]))
    # Stable sort: ties keep the order in which the variants were supplied.
    edits.sort(key=lambda edit: (edit[0], edit[1]))

    merged: List[str] = []
    cursor = 0
    winner = None
    conflicts = 0
    for edit in edits:
        i1, i2, replacement = edit
        if i1 < cursor:
            if edit != winner:
                conflicts += 1
            continue
        merged.extend(base_lines[cursor:i1])
        merged.extend(replacement)
        cursor = i2
        winner = edit
    merged.extend(base_lines[cursor:])
    return "".join(merged), conflicts


def apply_planned_changes(changes: Iterable[PlannedChange], dry_run: bool) -> int:
    """Apply textual rewrites first, then perform any outstanding file moves.

    - *changes* may be any iterable, including a one-shot generator.
    - A change whose ``path`` no longer exists but whose ``new_path`` does is
      applied at ``new_path`` (the caller already relocated the file), and
      no second move is attempted for it.
    - Several changes targeting the same file are merged line-wise rather
      than overwriting one another, provided they share the same
      ``orig_text``. Otherwise the last one wins and a warning is printed.
    - Files are read and written as UTF-8, matching how the plans were built.

    Returns:
        Number of changes actually applied (in dry-run mode: the number that
        would be attempted). A rewrite that leaves a file unchanged is not
        counted.
    """
    change_list = list(changes)
    applied = 0

    # 1. Collect requested relocations.
    planned_moves: Dict[Path, Path] = {}
    for c in change_list:
        if c.new_path and Path(c.new_path) != Path(c.path):
            planned_moves[Path(c.path)] = Path(c.new_path)

    # 2. Group rewrites by the file they must be applied to on disk.
    by_file: Dict[Path, List[PlannedChange]] = {}
    for c in change_list:
        path = Path(c.path)
        if not path.exists() and path in planned_moves:
            path = planned_moves[path]
        if not path.exists():
            print(f"[WARN] Skipping rewrite, file missing: {path}")
            continue
        if dry_run:
            print(f"[DRY RUN] Would rewrite {path}")
            applied += 1
            continue
        by_file.setdefault(path, []).append(c)

    for path, group in by_file.items():
        base_text = group[0].orig_text
        if all(c.orig_text == base_text for c in group):
            new_content, conflicts = _merge_rewrites(
                base_text, [c.new_text for c in group]
            )
            if conflicts:
                print(
                    f"[WARN] {conflicts} overlapping edit(s) dropped while "
                    f"merging rewrites of {path}"
                )
        else:
            # Plans built from different snapshots cannot be merged safely.
            print(f"[WARN] Conflicting plans for {path}; applying the last one only")
            new_content = group[-1].new_text

        mode = path.stat().st_mode
        content = path.read_text(encoding="utf-8", errors="replace")
        if new_content != content:
            path.write_text(new_content, encoding="utf-8")
            path.chmod(mode)
            applied += len(group)

    # 3. Only AFTER rewrites, perform moves that are still outstanding.
    for old, new in planned_moves.items():
        if dry_run:
            print(f"[DRY RUN] Would move {old} -> {new}")
            applied += 1
            continue
        if not old.exists() and new.exists():
            # Already relocated by the caller; nothing left to do.
            continue
        try:
            repo_root = find_repo_root(old)
            # git mv, like shutil.move, needs the destination directory.
            new.parent.mkdir(parents=True, exist_ok=True)
            if repo_root:
                subprocess.run(
                    ["git", "mv", str(old), str(new)], check=True, cwd=repo_root
                )
            else:
                shutil.move(str(old), str(new))
            applied += 1
        except Exception as e:
            print(f"[ERROR] Failed to move {old} -> {new}: {e}")
    return applied
# ------------------------------- Main ------------------------------------ #
def parse_args(argv: Sequence[str]) -> argparse.Namespace:
    """Parse the command-line arguments in *argv* (without the program name).

    ``--src`` and ``--dst`` are required. ``--base_dir`` defaults to ``.``,
    ``--ns_root`` to ``None``, ``--apply`` and ``--no_git_mv`` are boolean
    switches, and ``--extensions`` defaults to the comma-joined built-in
    header and source suffixes.
    """
    parser = argparse.ArgumentParser(
        description="Move files/dirs and update C++ namespaces/includes/guards."
    )
    # (flag, add_argument keyword arguments), registered in display order.
    option_table = [
        (
            "--src",
            dict(
                required=True,
                help="Source file or directory (repo-relative or absolute).",
            ),
        ),
        (
            "--dst",
            dict(
                required=True,
                help="Destination path (repo-relative or absolute). For files, this may be a directory or new filename.",
            ),
        ),
        (
            "--base_dir",
            dict(
                default=".",
                help="Repository root (defaults to working directory).",
            ),
        ),
        (
            "--ns_root",
            dict(
                default=None,
                help="Optional namespace root to strip from path->namespace conversion.",
            ),
        ),
        (
            "--apply",
            dict(
                action="store_true",
                help="Apply changes. Otherwise do a dry-run plan.",
            ),
        ),
        (
            "--no_git_mv",
            dict(
                action="store_true",
                help="Do not use git mv even if available.",
            ),
        ),
        (
            "--extensions",
            dict(
                default=",".join(default_header_exts() + default_source_exts()),
                help="Comma-separated list of file extensions to scan.",
            ),
        ),
    ]
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    return parser.parse_args(argv)
def main(argv: Sequence[str]) -> int:
    """Plan and, with ``--apply``, execute a move.

    Returns a process exit code:
        0  success (or nothing to do)
        2  the move could not be planned (missing source, path outside the
           repository, destination already exists)
        3  ``--apply`` was refused because the git working tree is not clean

    Behavior worth knowing:
    - ``--dst some/dir/`` (trailing slash) with a single source file always
      means "into that directory", even if it does not exist yet.
    - ``--no_git_mv`` skips both ``git mv`` and the clean-tree requirement.
    - Whether to use ``git mv`` is decided once, before the first move,
      because the first ``git mv`` itself makes the tree dirty.
    - Existing destination files are never overwritten.
    - Suffixes given via ``--extensions`` are scanned in addition to the
      built-in C/C++ ones when looking for cross-references.
    """
    args = parse_args(argv)
    base = Path(args.base_dir).resolve()
    src = Path(args.src)
    dst = Path(args.dst)
    ns_root = Path(args.ns_root) if args.ns_root else None
    print(f"Base dir: {base}")

    # Path() drops a trailing slash, so honour the "into this directory"
    # intent for single-file moves here.
    if args.dst.endswith(("/", os.sep)) and (base / src).is_file():
        dst = dst / src.name

    try:
        moves = plan_moves(base, src, dst)
        moves = {old: new for old, new in moves.items() if old != new}
        for old, new in moves.items():
            # Everything below assumes both ends live inside the repository.
            old.relative_to(base)
            new.relative_to(base)
    except (Exception, SystemExit) as e:
        # plan_moves reports a missing source via SystemExit.
        eprint(f"Error planning moves: {e}")
        return 2
    if not moves:
        print("No files to move.")
        return 0

    print("Planned moves:")
    for old, new in moves.items():
        print(f"  {old.relative_to(base)} -> {new.relative_to(base)}")

    clashes = [
        new for old, new in moves.items() if new.exists() and not new.samefile(old)
    ]
    if clashes:
        for new in clashes:
            eprint(f"Destination already exists: {new.relative_to(base)}")
        return 2

    # Plan every textual change while the files are still at their old paths.
    guard_changes = plan_header_guard_rewrites(base, moves)
    ns_changes = plan_namespace_rewrites_for_moved_files(base, moves, ns_root)
    extra_exts = [
        ext if ext.startswith(".") else "." + ext
        for ext in (piece.strip() for piece in args.extensions.split(","))
        if ext
    ]
    include_and_path_changes = plan_include_and_path_rewrites(
        base, moves, additional_exts=extra_exts
    )
    total_changes = len(guard_changes) + len(ns_changes) + len(include_and_path_changes)
    print("\nPlanned internal-file changes:")
    print(f"  Header guard rewrites: {len(guard_changes)}")
    print(f"  Namespace rewrites (moved files): {len(ns_changes)}")
    print(f"  Cross-repo include/path/usage changes: {len(include_and_path_changes)}")
    if total_changes == 0:
        print("\nNo textual changes required beyond file moves.")

    if not args.apply:
        print("\nDry-run: not performing file moves. Use --apply to perform them.")
        print("\nPlanned textual changes (dry-run):")
        for c in guard_changes:
            print(f"  [guard] would rewrite guard in {c.path.relative_to(base)}")
        for c in ns_changes:
            print(f"  [ns] would rewrite namespace in {c.path.relative_to(base)}")
        for c in include_and_path_changes:
            print(f"  [xref] would rewrite {c.path.relative_to(base)}")
        print(
            "\nDone. Please run your build and tests to validate changes. Expect manual follow-up for some edge-cases."
        )
        return 0

    # Require a clean git working tree unless the user opted out of git.
    use_git = not args.no_git_mv
    if use_git and not git_is_clean(base):
        eprint(
            "Refusing to apply until git working tree is clean. Commit or stash your changes, or rerun with --no_git_mv to ignore."
        )
        return 3

    # Perform moves (git mv preferred, plain move as fallback).
    for old, new in moves.items():
        new.parent.mkdir(parents=True, exist_ok=True)
        if use_git and try_git_mv(old, new, base):
            print(f"git mv {old.relative_to(base)} -> {new.relative_to(base)}")
        else:
            shutil.move(str(old), str(new))
            print(f"moved {old.relative_to(base)} -> {new.relative_to(base)}")

    # Apply textual changes at the files' current (post-move) locations and
    # mark them as needing no further relocation.
    relocated: List[PlannedChange] = []
    for c in guard_changes + ns_changes + include_and_path_changes:
        on_disk = moves.get(c.path, c.path)
        relocated.append(PlannedChange(on_disk, c.orig_text, c.new_text, new_path=on_disk))
    c1 = apply_planned_changes(relocated, dry_run=False)
    print(f"\nApplied {c1} textual changes.")

    print(
        "\nDone. Please run your build and tests to validate changes. Expect manual follow-up for some edge-cases."
    )
    return 0
if __name__ == "__main__":
    # Script entry point: Ctrl-C maps to exit status 130 instead of a traceback.
    try:
        exit_code = main(sys.argv[1:])
    except KeyboardInterrupt:
        eprint("Interrupted.")
        exit_code = 130
    sys.exit(exit_code)
# ------------------------------ BUILD snippet ---------------------------- #
# Add the following to cs/devtools/BUILD:
#
# py_binary(
# name = "move_cc",
# srcs = ["move_cc.py"],
# main = "move_cc.py",
# )
#
# Run under bazel (pass -- to forward args to the program):
#
# bazel run //cs/devtools:move_cc -- --src cs/foo/bar --dst cs/foo/qux --apply --base_dir=$PWD
#
# Note: when running via `bazel run` you must forward --base_dir (or adapt the script to
# autodetect workspace root). The example here forwards $PWD from your shell as the repo root.