Skip to content

Commit

Permalink
Merge branch 'master' into use-pyproject-toml
Browse files Browse the repository at this point in the history
  • Loading branch information
kha-white committed Jun 21, 2024
2 parents 7a8445c + 965d0f6 commit a614b17
Show file tree
Hide file tree
Showing 20 changed files with 586 additions and 363 deletions.
10 changes: 10 additions & 0 deletions .github/workflows/black.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Lint workflow: runs the Black action on every push and pull request.
name: Lint

on: [push, pull_request]

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      # Check out the repository contents for the following step.
      - uses: actions/checkout@v3
      # Run the action published in the psf/black repository, at its `stable` ref.
      - uses: psf/black@stable
2 changes: 1 addition & 1 deletion manga_ocr/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,5 @@ def main():
fire.Fire(run)


if __name__ == '__main__':
if __name__ == "__main__":
main()
2 changes: 1 addition & 1 deletion manga_ocr/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.1.11"
__version__ = "0.1.11"
40 changes: 24 additions & 16 deletions manga_ocr/ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,37 +9,45 @@


class MangaOcr:
def __init__(self, pretrained_model_name_or_path='kha-white/manga-ocr-base', force_cpu=False):
logger.info(f'Loading OCR model from {pretrained_model_name_or_path}')
self.processor = ViTImageProcessor.from_pretrained(pretrained_model_name_or_path)
def __init__(
self, pretrained_model_name_or_path="kha-white/manga-ocr-base", force_cpu=False
):
logger.info(f"Loading OCR model from {pretrained_model_name_or_path}")
self.processor = ViTImageProcessor.from_pretrained(
pretrained_model_name_or_path
)
self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
self.model = VisionEncoderDecoderModel.from_pretrained(pretrained_model_name_or_path)
self.model = VisionEncoderDecoderModel.from_pretrained(
pretrained_model_name_or_path
)

if not force_cpu and torch.cuda.is_available():
logger.info('Using CUDA')
logger.info("Using CUDA")
self.model.cuda()
elif not force_cpu and torch.backends.mps.is_available():
logger.info('Using MPS')
self.model.to('mps')
logger.info("Using MPS")
self.model.to("mps")
else:
logger.info('Using CPU')
logger.info("Using CPU")

example_path = Path(__file__).parent / 'assets/example.jpg'
example_path = Path(__file__).parent / "assets/example.jpg"
if not example_path.is_file():
raise FileNotFoundError(f'Missing example image {example_path}')
raise FileNotFoundError(f"Missing example image {example_path}")
self(example_path)

logger.info('OCR ready')
logger.info("OCR ready")

def __call__(self, img_or_path):
if isinstance(img_or_path, str) or isinstance(img_or_path, Path):
img = Image.open(img_or_path)
elif isinstance(img_or_path, Image.Image):
img = img_or_path
else:
raise ValueError(f'img_or_path must be a path or PIL.Image, instead got: {img_or_path}')
raise ValueError(
f"img_or_path must be a path or PIL.Image, instead got: {img_or_path}"
)

img = img.convert('L').convert('RGB')
img = img.convert("L").convert("RGB")

x = self._preprocess(img)
x = self.model.generate(x[None].to(self.model.device), max_length=300)[0].cpu()
Expand All @@ -53,9 +61,9 @@ def _preprocess(self, img):


def post_process(text):
    """Normalize a recognized text string.

    The steps are applied in this order:

    1. Remove all whitespace: ``str.split()`` with no arguments splits on any
       whitespace run, and the pieces are joined back with nothing between.
    2. Replace each ``…`` character with three ASCII dots.
    3. Replace every run of two or more ``・`` / ``.`` characters with the
       same number of ASCII dots.
    4. Pass the result through ``jaconv.h2z`` with ``ascii=True`` and
       ``digit=True``.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    text = "".join(text.split())
    text = text.replace("…", "...")
    # The callback keeps the length of each matched run unchanged while
    # turning every character in it into ".".
    text = re.sub(r"[・.]{2,}", lambda m: "." * len(m.group()), text)
    text = jaconv.h2z(text, ascii=True, digit=True)

    return text
63 changes: 38 additions & 25 deletions manga_ocr/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,30 +27,33 @@ def process_and_write_results(mocr, img_or_path, write_to):
text = mocr(img_or_path)
t1 = time.time()

logger.info(f'Text recognized in {t1 - t0:0.03f} s: {text}')
logger.info(f"Text recognized in {t1 - t0:0.03f} s: {text}")

if write_to == 'clipboard':
if write_to == "clipboard":
pyperclip.copy(text)
else:
write_to = Path(write_to)
if write_to.suffix != '.txt':
raise ValueError('write_to must be either "clipboard" or a path to a text file')
if write_to.suffix != ".txt":
raise ValueError(
'write_to must be either "clipboard" or a path to a text file'
)

with write_to.open('a', encoding="utf-8") as f:
f.write(text + '\n')
with write_to.open("a", encoding="utf-8") as f:
f.write(text + "\n")


def get_path_key(path):
    """Return the path paired with its last-modification time.

    ``lstat`` is used rather than ``stat``, so for a symbolic link the
    timestamp is that of the link itself, not of its target.

    Args:
        path: An object exposing ``lstat()`` (e.g. a ``pathlib.Path``).

    Returns:
        The tuple ``(path, path.lstat().st_mtime)``.
    """
    return path, path.lstat().st_mtime


def run(read_from='clipboard',
write_to='clipboard',
pretrained_model_name_or_path='kha-white/manga-ocr-base',
force_cpu=False,
delay_secs=0.1,
verbose=False
):
def run(
read_from="clipboard",
write_to="clipboard",
pretrained_model_name_or_path="kha-white/manga-ocr-base",
force_cpu=False,
delay_secs=0.1,
verbose=False,
):
"""
Run OCR in the background, waiting for new images to appear either in system clipboard, or a directory.
Recognized texts can be either saved to system clipboard, or appended to a text file.
Expand All @@ -65,21 +68,25 @@ def run(read_from='clipboard',

mocr = MangaOcr(pretrained_model_name_or_path, force_cpu)

if sys.platform not in ('darwin', 'win32') and write_to == 'clipboard':
if sys.platform not in ("darwin", "win32") and write_to == "clipboard":
# Check if the system is using Wayland
import os
if os.environ.get('WAYLAND_DISPLAY'):

if os.environ.get("WAYLAND_DISPLAY"):
# Check if the wl-clipboard package is installed
if os.system("which wl-copy > /dev/null") == 0:
pyperclip.set_clipboard("wl-clipboard")
else:
msg = 'Your session uses wayland and does not have wl-clipboard installed. ' \
'Install wl-clipboard for write in clipboard to work.'
msg = (
"Your session uses wayland and does not have wl-clipboard installed. "
"Install wl-clipboard for write in clipboard to work."
)
raise NotImplementedError(msg)

if read_from == 'clipboard':
if read_from == "clipboard":
from PIL import ImageGrab
logger.info('Reading from clipboard')

logger.info("Reading from clipboard")

img = None
while True:
Expand All @@ -95,19 +102,25 @@ def run(read_from='clipboard',
# Pillow error when clipboard contains text (Linux, X11)
pass
else:
logger.warning('Error while reading from clipboard ({})'.format(error))
logger.warning(
"Error while reading from clipboard ({})".format(error)
)
else:
if isinstance(img, Image.Image) and not are_images_identical(img, old_img):
if isinstance(img, Image.Image) and not are_images_identical(
img, old_img
):
process_and_write_results(mocr, img, write_to)

time.sleep(delay_secs)

else:
read_from = Path(read_from)
if not read_from.is_dir():
raise ValueError('read_from must be either "clipboard" or a path to a directory')
raise ValueError(
'read_from must be either "clipboard" or a path to a directory'
)

logger.info(f'Reading from directory {read_from}')
logger.info(f"Reading from directory {read_from}")

old_paths = set()
for path in read_from.iterdir():
Expand All @@ -123,12 +136,12 @@ def run(read_from='clipboard',
img = Image.open(path)
img.load()
except (UnidentifiedImageError, OSError) as e:
logger.warning(f'Error while reading file {path}: {e}')
logger.warning(f"Error while reading file {path}: {e}")
else:
process_and_write_results(mocr, img, write_to)

time.sleep(delay_secs)


if __name__ == '__main__':
if __name__ == "__main__":
fire.Fire(run)
16 changes: 9 additions & 7 deletions manga_ocr_dev/data/generate_backgrounds.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ def find_rectangle(mask, y, x, aspect_ratio_range=(0.33, 3.0)):


def generate_backgrounds(crops_per_page=5, min_size=40):
data = pd.read_csv(MANGA109_ROOT / 'data.csv')
frames_df = pd.read_csv(MANGA109_ROOT / 'frames.csv')
data = pd.read_csv(MANGA109_ROOT / "data.csv")
frames_df = pd.read_csv(MANGA109_ROOT / "frames.csv")

BACKGROUND_DIR.mkdir(parents=True, exist_ok=True)

Expand All @@ -57,11 +57,11 @@ def generate_backgrounds(crops_per_page=5, min_size=40):
page = cv2.imread(str(MANGA109_ROOT / page_path))
mask = np.zeros((page.shape[0], page.shape[1]), dtype=bool)
for row in data[data.page_path == page_path].itertuples():
mask[row.ymin:row.ymax, row.xmin:row.xmax] = True
mask[row.ymin : row.ymax, row.xmin : row.xmax] = True

frames_mask = np.zeros((page.shape[0], page.shape[1]), dtype=bool)
for row in frames_df[frames_df.page_path == page_path].itertuples():
frames_mask[row.ymin:row.ymax, row.xmin:row.xmax] = True
frames_mask[row.ymin : row.ymax, row.xmin : row.xmax] = True

mask = mask | ~frames_mask

Expand All @@ -76,10 +76,12 @@ def generate_backgrounds(crops_per_page=5, min_size=40):
crop = page[ymin:ymax, xmin:xmax]

if crop.shape[0] >= min_size and crop.shape[1] >= min_size:
out_filename = '_'.join(
Path(page_path).with_suffix('').parts[-2:]) + f'_{ymin}_{ymax}_{xmin}_{xmax}.png'
out_filename = (
"_".join(Path(page_path).with_suffix("").parts[-2:])
+ f"_{ymin}_{ymax}_{xmin}_{xmax}.png"
)
cv2.imwrite(str(BACKGROUND_DIR / out_filename), crop)


if __name__ == '__main__':
if __name__ == "__main__":
generate_backgrounds()
Loading

0 comments on commit a614b17

Please sign in to comment.