% Sun et al., 2003 ACM Symposium on Document Engineering.
% The citation key is kept unchanged so that existing \cite commands still resolve.
@inproceedings{ddf69fcb35554d89a9e2dad6c518df23,
  author    = {Sun, Jun and Wang, Zhulong and Yu, Hao and Nishino, Fumihito and Katsuyama, Yutaka and Naoi, Satoshi},
  title     = {Effective text extraction and recognition for {WWW} images},
  booktitle = {Proceedings of the 2003 {ACM} Symposium on Document Engineering},
  year      = {2003},
  pages     = {115--117},
  publisher = {Association for Computing Machinery ({ACM})},
  address   = {New York, NY, USA},
  doi       = {10.1145/958238.958241},
  isbn      = {1581137249},
  language  = {English},
  keywords  = {Approximate matching, Text extraction, Text recognition},
  abstract  = {Images play a very important role in web content delivery. Many WWW images contain text information that can be used for web indexing and searching. A new text extraction and recognition algorithm is proposed in this paper. The character strokes in the image are first extracted by color clustering and connected component analysis. A novel stroke verification algorithm is used to effectively remove non-character strokes. The verified strokes are then used to build the binary text line image, which is segmented and recognized by dynamic programming. Since text in WWW image usually has close relationship with webpage content, approximate string matching is used to revise the recognition result by matching the content in the webpage with the content in the image. This effective post-processing not only improves the recognition performance, but also can be used in other applications such like image - webpage paragraph corresponding.},
  note      = {Conference date: 20--22 November 2003},
}