I am trying to write a program that extracts the text from PDF documents. The documents contain Arabic text written in several different fonts.
When I extract the text, it works for some files, but for others the result is garbled, unreadable text.
I am using C# and iText 7 to write this program.
Please show me the right approach for doing this, with some examples.
Thank you.
data:image/s3,"s3://crabby-images/93b41/93b41d18ec2d4e595176ea9bc145573012535898" alt=""
data:image/s3,"s3://crabby-images/9fe91/9fe91661e7fb768f576ec1f988b9723d0b2625af" alt=""
data:image/s3,"s3://crabby-images/90983/90983034f5d87cfe828f238a8eff6090ef72227b" alt=""
My code:
// Extracts the text of every page of the PDF at `src`, passes each extracted
// line through Inverse(), and shows the joined result in textBox1.
StringBuilder processed = new StringBuilder();
var src = "d:\\text06.pdf";

// Dispose the document when done so the underlying file is released even if
// extraction throws part-way through.
using (var pdfDocument = new PdfDocument(new PdfReader(src)))
{
    int pageCount = pdfDocument.GetNumberOfPages();

    // iText page numbers are 1-based.
    for (int i = 1; i <= pageCount; i++)
    {
        // Separate consecutive pages with a line break. Without it, the last
        // line of one page and the first line of the next are glued together
        // and handed to Inverse() as a single line.
        if (i > 1)
        {
            processed.Append('\n');
        }

        PdfPage page = pdfDocument.GetPage(i);
        processed.Append(PdfTextExtractor.GetTextFromPage(page));
    }
}

// NOTE(review): garbled output for some fonts is presumably caused by fonts
// without a usable ToUnicode mapping; that cannot be repaired by changing the
// font's Encoding entry after the fact - confirm by inspecting the affected
// font dictionaries.

// Inverse() is applied per line, so split on the '\n' separators first.
string[] lines = processed.ToString().Split('\n');
List<String> Converted_Lines = new List<string>(lines.Length);
foreach (string s in lines)
{
    Converted_Lines.Add(Inverse(s));
}

textBox1.Text = String.Join(Environment.NewLine, Converted_Lines);