[Solved] Verify file format first and then retrieve required information in C


Compared to the earlier question, this one isn’t about comparing files for differences with certain exceptions. Here you have a format file that provides the valid tags and order, and then a data file that contains the tags with data. So rather than comparing for differences, you are reading the first to obtain the expected/valid tags, then reading/processing the second to obtain the wanted information.

Below, I also have the code check that the tags in the file appear in the correct order. You can loosen that restriction if you don’t need it. Another bit of logic skips lines of fewer than 3 chars, since a valid tag has at least 3 characters (e.g. <t>).

The formatting of the output is very simple and you can improve it as needed. I had no data file to work with, so I used the information you provided and created one by duplicating your file above 3 times in a separate file. Look over the code. As others have mentioned, parsing XML in C, while a great assignment, is rarely done in practice because other languages and libraries provide readily available tools for handling the schemas. Let me know if you have any questions. This will provide you with one approach to handling this type of information:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAXL 128

char *gettag (char *s, char *t);

/* Report whether 'tag' begins with one of the 'n' strings in 'list'
 * (prefix comparison over the length of each list entry).
 * Returns 1 on the first match, storing its index in '*pos' when 'pos'
 * is non-NULL; returns 0 when nothing matches.
 */
static int tag_in_list (const char *tag, char * const *list, size_t n,
                        size_t *pos)
{
    for (size_t i = 0; i < n; i++) {
        if (strncmp (tag, list[i], strlen (list[i])) == 0) {
            if (pos)
                *pos = i;
            return 1;
        }
    }
    return 0;
}

/* Print whatever follows the first '>' in 'line', followed by a newline.
 * Nothing is printed when 'line' contains no '>'.
 */
static void print_data (const char *line)
{
    const char *p = strchr (line, '>');

    if (p)
        printf ("%s\n", p + 1);
}

/* Remove trailing '\n' / '\r' characters from 's'; return the new length. */
static size_t chomp (char *s)
{
    size_t len = strlen (s);

    while (len && (s[len-1] == '\n' || s[len-1] == '\r'))
        s[--len] = 0;

    return len;
}

/* usage: prog file1 file2
 *
 * file1 (format file) lists the valid tags, one per line; file2 (data file)
 * holds lines of the form <tag>data. Every line of file2 whose tag is not
 * listed in file1 produces a warning on stderr and is skipped. For the
 * remaining lines, the data after the tag is printed when the tag is in
 * retr[] (every time) or retr1[] (first occurrence only); all other valid
 * tags are consumed silently.
 *
 * Returns 0 on success, 1 on bad usage, file open failure, a format line
 * without a tag, an empty format file, or too many format tags.
 */
int main (int argc, char **argv) {

    if (argc < 3 ) {
        fprintf (stderr, "error: insufficient input, usage: %s file1 file2\n",
                argv[0]);
        return 1;
    }

    char s1[MAXL] = {0};                        /* line buffer      */
    char *tags[MAXL] = {0};                     /* file 1 tags      */
    char *retr[] = { "<Team>", "<phone>",       /* print every time */
                    "<rollno>" };
    char *retr1[] = { "</SchoolName>",          /* print once       */
                    "<Team>" };
    char *skip[] = { "<StudentNo>","<hobby>" }; /* skip terms       */
    char *opt[]  = { "<StartDate>",             /* optional tags    */
                    "<ExpiryDate>"};
    size_t retrsz = sizeof retr/sizeof *retr;   /* elements in retr */
    size_t retr1sz = sizeof retr1/sizeof *retr1;/* elements in retr1*/
    size_t skipsz = sizeof skip/sizeof *skip;   /* elements in skip */
    size_t optsz = sizeof opt/sizeof *opt;      /* elements in opt  */
    /* one "already printed" flag per retr1 entry (fixed size, no VLA) */
    unsigned char retvd[sizeof retr1/sizeof *retr1] = {0};
    size_t tidx = 0;                            /* number of tags   */
    size_t idx = 0;                             /* expected tag pos */
    FILE *f1 = NULL, *f2 = NULL;                /* file pointers    */
    int rc = 1;                                 /* exit status      */

    /* open both files or exit (file2 is only tried if file1 opened) */
    f1 = fopen (argv[1], "r");
    if (f1)
        f2 = fopen (argv[2], "r");
    if (!f1 || !f2) {
        fprintf (stderr, "error: file open failure.\n");
        goto cleanup;
    }

    /* read lines from format file1, create tags array */
    while (fgets (s1, MAXL, f1))
    {
        if (chomp (s1) < 3)     /* skip blank, 3 char for valid tag */
            continue;

        if (tidx == MAXL) {     /* tags[] is full, refuse to overflow */
            fprintf (stderr, "error: more than %d tags in format file.\n",
                    MAXL);
            goto cleanup;
        }

        char *tmp = gettag (s1, NULL);  /* allocated copy of the tag */
        if (tmp == NULL) {
            fprintf (stderr, "error: tag not found in '%s'\n", s1);
            goto cleanup;
        }
        tags[tidx++] = tmp;
    }

    fclose (f1);    /* close file1 */
    f1 = NULL;

    if (tidx == 0) {            /* nothing to validate file2 against */
        fprintf (stderr, "error: no tags found in format file.\n");
        goto cleanup;
    }

    /* read each line in file2 */
    while (fgets (s1, MAXL, f2))
    {
        char tag[MAXL] = {0};   /* stays "" if no tag can be extracted */
        size_t hit = 0;

        if (chomp (s1) < 3)     /* skip blank or lines < 3 chars    */
            continue;

        gettag (s1, tag);

        /* verify that current tag is a valid tag from format file: first
         * try the expected position, then any position. A valid tag found
         * out of its expected position is still accepted.
         */
        if (!tag_in_list (tag, &tags[idx], 1, NULL) &&
            !tag_in_list (tag, tags, tidx, NULL)) {
            fprintf (stderr, "warning: invalid tag '%s', skipping.\n", tag);
            continue;           /* or handle as desired (e.g. exit) */
        }

        if (tag_in_list (tag, retr1, retr1sz, &hit)) {
            /* print-once tag: print data the first time only; later
             * occurrences are consumed silently, even if also in retr[]
             */
            if (!retvd[hit]) {
                print_data (s1);
                retvd[hit] = 1;
            }
        }
        else if (tag_in_list (tag, retr, retrsz, NULL)) {
            print_data (s1);    /* always-print tag */
        }
        else if (tag_in_list (tag, skip, skipsz, NULL) ||
                 tag_in_list (tag, opt, optsz, NULL)) {
            /* known skip/optional tag: intentionally no output */
        }

        if (++idx == tidx)      /* advance expected position, wrap  */
            idx = 0;
    }

    rc = 0;

cleanup:
    if (f1)
        fclose (f1);
    if (f2)
        fclose (f2);            /* close file2      */

    for (size_t i = 0; i < tidx; i++)   /* free tags memory */
        free (tags[i]);

    return rc;
}

/* extract <tag> from s.
* The tag is the text running from the first '<' through the first '>'
* in 's'; anything before the '<' is ignored. 's' is never modified.
*
* if 't' is NULL, memory sufficient to hold <tag> + 1 characters is
* allocated with malloc (the caller must free it), else <tag> is copied
* to 't' without allocations; 't' must have room for <tag> + 1 characters.
*
* On success, the address of 't' is returned. NULL is returned if 's' is
* NULL, 's' holds no '>', no '<' precedes the first '>', or the
* allocation fails; in those cases a caller-supplied 't' is left untouched.
*/
char *gettag (char *s, char *t)
{
    if (!s) return NULL;                /* test valid string        */

    const char *end = strchr (s, '>');  /* find first '>' in s      */
    if (!end) return NULL;              /* if no '>', return NULL   */

    const char *beg = s;
    while (beg < end && *beg != '<')    /* skip text before '<'     */
        beg++;

    if (beg == end) return NULL;        /* '>' without a '<' is not a tag */

    size_t n = (size_t)(end - beg) + 1; /* tag length, '<'..'>' inclusive */

    if (!t) {
        t = malloc (n + 1);             /* allocate tag + terminator */
        if (!t) return NULL;
    }

    memcpy (t, beg, n);                 /* copy exactly the tag     */
    t[n] = 0;                           /* null-terminate           */

    return t;
}

File1 – format (list tags)

$ cat dat/student_format.txt
<School>
      </SchoolName>latha2  //skip, but keep
</School>
<Student>
   <Team>power  //skip,but keep
   <StudentNo>1 //skip
       <Sport>
            <StartDate>16122016</StartDate> //*skip(May or maynot contained)
            <SportType>All
            <ExpiryDate>16122020</EndDate> //*skip (May or maynot contained)
       </Sport>

 <Personal>
   <phone>50855466 //skip,but keep
   <rollno>6 //skip,but keep
 </Personal>
 <hobby>  //skip
</Student>

File2 – data file (same as above 3 times)

$ cat dat/student_file.txt
<School>
      </SchoolName>latha2
</School>

<Student>
   <Team>power
   <StudentNo>1
       <Sport>
            <StartDate>16122016</StartDate>
            <SportType>All
            <ExpiryDate>16122020</EndDate>
       </Sport>

 <Personal>
   <phone>50855466
   <rollno>6
 </Personal>
 <hobby>
</Student>
<School>
      </SchoolName>latha2
</School>

<Student>
   <Team>power
   <StudentNo>1
       <Sport>
            <StartDate>16122016</StartDate>
            <SportType>All
            <ExpiryDate>16122020</EndDate>
       </Sport>

 <Personal>
   <phone>50855466
   <rollno>6
 </Personal>
 <hobby>
</Student>
<School>
      </SchoolName>latha2
</School>

<Student>
   <Team>power
   <StudentNo>1
       <Sport>
            <StartDate>16122016</StartDate>
            <SportType>All
            <ExpiryDate>16122020</EndDate>
       </Sport>

 <Personal>
   <phone>50855466
   <rollno>6
 </Personal>
 <hobby>
</Student>

Example Output

$ ./bin/cmpf1f2_2 dat/student_format.txt dat/student_file.txt
latha2
power
50855466
6
50855466
6
50855466
6

9

solved Verify file format first and then retrieve required information in C