Wednesday, 6 November 2024

Fix links within Pdf files when moving from a File share to web hosting

Overview: Build a console application to help migrate more than 80k PDF documents and fix their internal links. The client used a DFS SMB file share to hold index PDFs and the many documents they link to, all of which needed to be moved to a SharePoint document library.

Hypothesis: Loop through all PDFs in a folder; where a PDF contains links, identify the file server links and convert them to web links so they work in the new SharePoint document library.  Various tools were identified as possible solutions but came up short in the migration.  Two good tools are Replace Magic & PDF-XChange Editor.

Resolution: Below is the C# code I wrote in VS Code to change the links.  The debugger was useful as there were many different types of links within the plethora of PDFs.


C# Code

using iText.Kernel.Pdf;
using iText.Kernel.Pdf.Annot;
using iText.Kernel.Pdf.Action;
using iText.Bouncycastle.Crypto;  // pdf fails at runtime periodically without the directive
using System.Text.RegularExpressions;
/// <summary>
/// Console tool that rewrites GoToR/Launch link actions inside every PDF of a folder
/// into URI actions pointing at a SharePoint document library.
/// For each input "name.pdf" a sibling "name_new.pdf" is written; the input is not modified.
/// </summary>
class Program
{
    // Root of the SharePoint library that rewritten links are built on.
    private const string SharePointBaseUrl = "https://radimaging.sharepoint.com/sites/Documents/Standards";

    // A plain-string /F entry longer than this is used verbatim as the new link target.
    // NOTE(review): presumably such strings are already complete URLs - confirm against real documents.
    private const int DirectTargetMinLength = 20;

    /// <summary>
    /// Prompts for the folder to process and for the library sub-folder name,
    /// then converts the links of every "*.pdf" file directly inside that folder.
    /// </summary>
    /// <param name="args">Command-line arguments (unused).</param>
    static void Main(string[] args)
    {
        string folderPath = @"C:\Users\PaulBeck\Downloads\Software\LinkConvert\ConvertLinksCsharp\"; // Replace with your folder path
        Console.WriteLine("Please enter a folder Path: e.g. " + folderPath);
        // ReadLine returns null when input is redirected and exhausted; treat that as "nothing entered".
        string inputPath = Console.ReadLine() ?? "";
        Console.WriteLine("Last Path: e.g. Childfolder2");
        string inputPathVol = Console.ReadLine() ?? "";

        // Only inputs longer than 5 characters replace the default folder.
        if (inputPath.Length > 5)
        {
            folderPath = inputPath;
        }

        if (!Directory.Exists(folderPath))
        {
            Console.WriteLine($"Folder not found: {folderPath}");
            return;
        }

        foreach (string file in Directory.GetFiles(folderPath, "*.pdf"))
        {
            string directory = Path.GetDirectoryName(file) ?? folderPath;
            string filename = Path.GetFileNameWithoutExtension(file);
            string extension = Path.GetExtension(file);
            string newFilename = $"{filename}_new{extension}";  // Create the new filename
            string newFilePath = Path.Combine(directory, newFilename); // Output is written next to the input file

            // Pass the plain file system path: converting it to a file:// URI first
            // percent-encodes characters such as spaces and '#', so it no longer names the file on disk.
            UpdatePdf(file, newFilePath, inputPathVol);
        }
    }

    /// <summary>
    /// Copies <paramref name="inputFilePath"/> to <paramref name="outputFilePath"/>, replacing the action
    /// of every link annotation whose action type is GoToR or Launch with a URI action.
    /// </summary>
    /// <param name="inputFilePath">PDF to read.</param>
    /// <param name="outputFilePath">PDF to create.</param>
    /// <param name="lastPathPart">Library sub-folder inserted between the base URL and the target file name.</param>
    private static void UpdatePdf(string inputFilePath, string outputFilePath, string lastPathPart)
    {
        // The using block closes the document (and flushes the output) even if processing throws.
        using (PdfDocument pdfDoc = new PdfDocument(new PdfReader(inputFilePath), new PdfWriter(outputFilePath)))
        {
            for (int i = 1; i <= pdfDoc.GetNumberOfPages(); i++)        // Page numbers are 1-based
            {
                foreach (var annotation in pdfDoc.GetPage(i).GetAnnotations())
                {
                    if (!PdfName.Link.Equals(annotation.GetSubtype()))
                    {
                        continue;
                    }

                    var linkAnnotation = (PdfLinkAnnotation)annotation;
                    PdfDictionary action = linkAnnotation.GetAction();
                    if (action is null)
                    {
                        Console.WriteLine("No URL found.");
                        continue;
                    }

                    // Diagnostic dump of the action dictionary.
                    foreach (var key in action.KeySet())
                    {
                        Console.WriteLine($"{key}: {action.Get(key)}");
                    }

                    // Compare from the constant side so an action without an /S entry is skipped instead of throwing.
                    PdfObject actionType = action.Get(PdfName.S);
                    if (!PdfName.GoToR.Equals(actionType) && !PdfName.Launch.Equals(actionType))
                    {
                        continue;
                    }

                    linkAnnotation.SetAction(PdfAction.CreateURI(BuildTargetUri(action, lastPathPart)));
                }
            }
        }

        Console.WriteLine("PDF links updated successfully!");
    }

    /// <summary>
    /// Builds the replacement link target for one GoToR/Launch action dictionary.
    /// </summary>
    /// <param name="action">Action dictionary of the link annotation.</param>
    /// <param name="lastPathPart">Library sub-folder used when the target name has no "../" segments.</param>
    /// <returns>The string to store in the new URI action.</returns>
    private static string BuildTargetUri(PdfDictionary action, string lastPathPart)
    {
        // /F stored as a plain string: long values are used unchanged.
        string directTarget = action.GetAsString(PdfName.F)?.ToString() ?? "";
        if (directTarget.Length > DirectTargetMinLength)
        {
            return directTarget;
        }

        // /F stored as a dictionary: the file name is its own /F entry.
        // Resolved per action so a name from an earlier link can never leak into this one.
        string innerName = "";
        if (action.Get(PdfName.F) is PdfDictionary fileSpec)
        {
            innerName = fileSpec.Get(PdfName.F)?.ToString() ?? "";
        }

        string uri = $"{SharePointBaseUrl}/{lastPathPart}/{innerName}";
        if (uri.Contains("../"))
        {
            // A name with "../" segments is placed directly under the library root
            // (lastPathPart is dropped) and the "../" segments are removed.
            uri = $"{SharePointBaseUrl}/{innerName}".Replace("../", string.Empty);
        }

        return uri;
    }
}



0 comments:

Post a Comment