it-swarm-fr.com

En train de lire PDF documents en .Net

Existe-t-il une bibliothèque open source qui m’aide à lire/analyser les documents PDF dans .Net/C #?

79
JRoppert

iTextSharp est le meilleur pari. L'utilisé pour faire une araignée pour lucene.Net afin qu'il puisse explorer PDF.

using System;
using System.IO;
using iTextSharp.text.pdf;
using System.Text.RegularExpressions;

namespace Spider.Utils
{
    /// <summary>
    /// Parses a PDF file and extracts the text from it.
    /// </summary>
    public class PDFParser
    {
        /// BT = Beginning of a text object operator 
        /// ET = End of a text object operator
        /// Td move to the start of next line
        ///  5 Ts = superscript
        /// -5 Ts = subscript

        #region Fields

        #region _numberOfCharsToKeep
        /// <summary>
        /// The number of characters to keep, when extracting text.
        /// </summary>
        private static int _numberOfCharsToKeep = 15;
        #endregion

        #endregion

        #region ExtractText
        /// <summary>
        /// Extracts a text from a PDF file.
        /// </summary>
        /// <param name="inFileName">the full path to the pdf file.</param>
        /// <param name="outFileName">the output file name.</param>
        /// <returns>the extracted text</returns>
        public bool ExtractText(string inFileName, string outFileName)
        {
            StreamWriter outFile = null;
            try
            {
                // Create a reader for the given PDF file
                PdfReader reader = new PdfReader(inFileName);
                //outFile = File.CreateText(outFileName);
                outFile = new StreamWriter(outFileName, false, System.Text.Encoding.UTF8);

                Console.Write("Processing: ");

                int totalLen = 68;
                float charUnit = ((float)totalLen) / (float)reader.NumberOfPages;
                int totalWritten = 0;
                float curUnit = 0;

                for (int page = 1; page <= reader.NumberOfPages; page++)
                {
                    outFile.Write(ExtractTextFromPDFBytes(reader.GetPageContent(page)) + " ");

                    // Write the progress.
                    if (charUnit >= 1.0f)
                    {
                        for (int i = 0; i < (int)charUnit; i++)
                        {
                            Console.Write("#");
                            totalWritten++;
                        }
                    }
                    else
                    {
                        curUnit += charUnit;
                        if (curUnit >= 1.0f)
                        {
                            for (int i = 0; i < (int)curUnit; i++)
                            {
                                Console.Write("#");
                                totalWritten++;
                            }
                            curUnit = 0;
                        }

                    }
                }

                if (totalWritten < totalLen)
                {
                    for (int i = 0; i < (totalLen - totalWritten); i++)
                    {
                        Console.Write("#");
                    }
                }
                return true;
            }
            catch
            {
                return false;
            }
            finally
            {
                if (outFile != null) outFile.Close();
            }
        }
        #endregion

        #region ExtractTextFromPDFBytes
        /// <summary>
        /// This method processes an uncompressed Adobe (text) object 
        /// and extracts text.
        /// </summary>
        /// <param name="input">uncompressed</param>
        /// <returns></returns>
        public string ExtractTextFromPDFBytes(byte[] input)
        {
            if (input == null || input.Length == 0) return "";

            try
            {
                string resultString = "";

                // Flag showing if we are we currently inside a text object
                bool inTextObject = false;

                // Flag showing if the next character is literal 
                // e.g. '\\' to get a '\' character or '\(' to get '('
                bool nextLiteral = false;

                // () Bracket nesting level. Text appears inside ()
                int bracketDepth = 0;

                // Keep previous chars to get extract numbers etc.:
                char[] previousCharacters = new char[_numberOfCharsToKeep];
                for (int j = 0; j < _numberOfCharsToKeep; j++) previousCharacters[j] = ' ';


                for (int i = 0; i < input.Length; i++)
                {
                    char c = (char)input[i];
                    if (input[i] == 213)
                        c = "'".ToCharArray()[0];

                    if (inTextObject)
                    {
                        // Position the text
                        if (bracketDepth == 0)
                        {
                            if (CheckToken(new string[] { "TD", "Td" }, previousCharacters))
                            {
                                resultString += "\n\r";
                            }
                            else
                            {
                                if (CheckToken(new string[] { "'", "T*", "\"" }, previousCharacters))
                                {
                                    resultString += "\n";
                                }
                                else
                                {
                                    if (CheckToken(new string[] { "Tj" }, previousCharacters))
                                    {
                                        resultString += " ";
                                    }
                                }
                            }
                        }

                        // End of a text object, also go to a new line.
                        if (bracketDepth == 0 &&
                            CheckToken(new string[] { "ET" }, previousCharacters))
                        {

                            inTextObject = false;
                            resultString += " ";
                        }
                        else
                        {
                            // Start outputting text
                            if ((c == '(') && (bracketDepth == 0) && (!nextLiteral))
                            {
                                bracketDepth = 1;
                            }
                            else
                            {
                                // Stop outputting text
                                if ((c == ')') && (bracketDepth == 1) && (!nextLiteral))
                                {
                                    bracketDepth = 0;
                                }
                                else
                                {
                                    // Just a normal text character:
                                    if (bracketDepth == 1)
                                    {
                                        // Only print out next character no matter what. 
                                        // Do not interpret.
                                        if (c == '\\' && !nextLiteral)
                                        {
                                            resultString += c.ToString();
                                            nextLiteral = true;
                                        }
                                        else
                                        {
                                            if (((c >= ' ') && (c <= '~')) ||
                                                ((c >= 128) && (c < 255)))
                                            {
                                                resultString += c.ToString();
                                            }

                                            nextLiteral = false;
                                        }
                                    }
                                }
                            }
                        }
                    }

                    // Store the recent characters for 
                    // when we have to go back for a checking
                    for (int j = 0; j < _numberOfCharsToKeep - 1; j++)
                    {
                        previousCharacters[j] = previousCharacters[j + 1];
                    }
                    previousCharacters[_numberOfCharsToKeep - 1] = c;

                    // Start of a text object
                    if (!inTextObject && CheckToken(new string[] { "BT" }, previousCharacters))
                    {
                        inTextObject = true;
                    }
                }

                return CleanupContent(resultString);
            }
            catch
            {
                return "";
            }
        }

        private string CleanupContent(string text)
        {
            string[] patterns = { @"\\\(", @"\\\)", @"\\226", @"\\222", @"\\223", @"\\224", @"\\340", @"\\342", @"\\344", @"\\300", @"\\302", @"\\304", @"\\351", @"\\350", @"\\352", @"\\353", @"\\311", @"\\310", @"\\312", @"\\313", @"\\362", @"\\364", @"\\366", @"\\322", @"\\324", @"\\326", @"\\354", @"\\356", @"\\357", @"\\314", @"\\316", @"\\317", @"\\347", @"\\307", @"\\371", @"\\373", @"\\374", @"\\331", @"\\333", @"\\334", @"\\256", @"\\231", @"\\253", @"\\273", @"\\251", @"\\221"};
            string[] replace = {   "(",     ")",      "-",     "'",      "\"",      "\"",    "à",      "â",      "ä",      "À",      "Â",      "Ä",      "é",      "è",      "ê",      "ë",      "É",      "È",      "Ê",      "Ë",      "ò",      "ô",      "ö",      "Ò",      "Ô",      "Ö",      "ì",      "î",      "ï",      "Ì",      "Î",      "Ï",      "ç",      "Ç",      "ù",      "û",      "ü",      "Ù",      "Û",      "Ü",      "®",      "™",      "«",      "»",      "©",      "'" };

            for (int i = 0; i < patterns.Length; i++)
            {
                string regExPattern = patterns[i];
                Regex regex = new Regex(regExPattern, RegexOptions.IgnoreCase);
                text = regex.Replace(text, replace[i]);
            }

            return text;
        }

        #endregion

        #region CheckToken
        /// <summary>
        /// Check if a certain 2 character token just came along (e.g. BT)
        /// </summary>
        /// <param name="tokens">the searched token</param>
        /// <param name="recent">the recent character array</param>
        /// <returns></returns>
        private bool CheckToken(string[] tokens, char[] recent)
        {
            foreach (string token in tokens)
            {
                if ((recent[_numberOfCharsToKeep - 3] == token[0]) &&
                    (recent[_numberOfCharsToKeep - 2] == token[1]) &&
                    ((recent[_numberOfCharsToKeep - 1] == ' ') ||
                    (recent[_numberOfCharsToKeep - 1] == 0x0d) ||
                    (recent[_numberOfCharsToKeep - 1] == 0x0a)) &&
                    ((recent[_numberOfCharsToKeep - 4] == ' ') ||
                    (recent[_numberOfCharsToKeep - 4] == 0x0d) ||
                    (recent[_numberOfCharsToKeep - 4] == 0x0a))
                    )
                {
                    return true;
                }
            }
            return false;
        }
        #endregion
    }
}
57
ceetheman

Depuis la dernière réponse à cette question en 2008, iTextSharp a considérablement amélioré leur api. Si vous téléchargez la dernière version de leur API à partir de http://sourceforge.net/projects/itextsharp/ , vous pouvez utiliser l'extrait de code suivant pour extraire tout le texte d'un fichier PDF dans une chaîne.

using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;

namespace PdfParser
{
    public static class PdfTextExtractor
    {
        public static string pdfText(string path)
        {
            PdfReader reader = new PdfReader(path);
            string text = string.Empty;
            for(int page = 1; page <= reader.NumberOfPages; page++)
            {
                text += PdfTextExtractor.GetTextFromPage(reader,page);
            }
            reader.Close();
            return text;
        }   
    }
}
92
Brock Nusser

PDFClown pourrait aider, mais je ne le recommanderais pas pour une application volumineuse ou lourde.

5
Ilya Kochetov
public string ReadPdfFile(object Filename, DataTable ReadLibray)
{
    PdfReader reader2 = new PdfReader((string)Filename);
    string strText = string.Empty;

    for (int page = 1; page <= reader2.NumberOfPages; page++)
    {
    ITextExtractionStrategy its = new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();
    PdfReader reader = new PdfReader((string)Filename);
    String s = PdfTextExtractor.GetTextFromPage(reader, page, its);

    s = Encoding.UTF8.GetString(ASCIIEncoding.Convert(Encoding.Default, Encoding.UTF8, Encoding.Default.GetBytes(s)));
    strText = strText + s;
    reader.Close();
    }
    return strText;
}
4
ShravankumarKumar

J'ai utilisé ITextSharp dans le passé pour manipuler/diviser et reformer PDF des documents - c'est assez simple et aussi open-source.

4
Guy Starbuck

iText est la meilleure bibliothèque que je connaisse. Initialement écrit en Java, il existe également un port .NET.

Voir http://www.ujihara.jp/iTextdotNET/en/

3
stonaudr
1
Dobermaxx99

http://www.c-sharpcorner.com/UploadFile/psingh/PDFFileGenerator12062005235236PM/PDFFileGenerator.aspx est à code source ouvert et peut constituer un bon point de départ pour vous.

1
Ben McEvoy

Vous pouvez regarder ceci: http://www.codeproject.com/KB/showcase/pdfrasterizer.aspx Ce n'est pas complètement gratuit, mais ça a l'air très sympa.

Alex

1
Alex Fort

aspose pdf fonctionne plutôt bien. là encore, vous devez payer pour cela

1
Kuvo
0
Cetra

Jetez un coup d'œil à Bibliothèque Docotic.Pdf . Vous n'avez pas besoin de rendre le code source de votre application ouvert (comme iTextSharp avec licence virale AGPL 3, par exemple).

Docotic.Pdf peut être utilisé pour lire des fichiers PDF et extraire du texte avec ou sans mise en forme. S'il vous plaît jeter un oeil à l'exemple qui montre comment extraire du texte à partir de PDF .

Disclaimer: Je travaille pour Bit Miracle, vendeur de la bibliothèque.

0
Bobrovsky