class Segment:
    """A segment of a line together with the date it was opened or closed.

    Attributes:
        start: Name of the place where the segment starts.
        end: Name of the place where the segment ends.
        date: Date text taken from the first table cell.
        opening_or_closure: ``"opening"``, ``"closure"``, or ``None`` when
            the kind of event could not be determined.
    """

    def __init__(self, start, end, date, opening_or_closure):
        self.start = start
        self.end = end
        self.date = date
        self.opening_or_closure = opening_or_closure

    def toXML(self):
        """Return a ``Segment`` element with the fields stored as attributes."""
        # ``ET`` must be bound at module level by the surrounding file.
        # NOTE(review): presumably xml.etree.ElementTree -- confirm.
        segment_element = ET.Element("Segment")
        segment_element.attrib["Start"] = self.start
        segment_element.attrib["End"] = self.end
        segment_element.attrib["Date"] = self.date
        # NOTE(review): this attribute name contains spaces, so the
        # serialized XML will not be well-formed. It is left unchanged here
        # because renaming it would change the output format.
        segment_element.attrib["Opening or closure"] = self.opening_or_closure
        return segment_element


def _cell_text(cell):
    """Return the text of a table cell given as a string or a tag object."""
    get_text = getattr(cell, "get_text", None)
    return get_text() if callable(get_text) else cell


def soup_to_segment(soup):
    """Build a :class:`Segment` from one parsed table row.

    The first ``td`` cell holds the date. If it is the only cell, the segment
    is taken to cover the entire line and gets the placeholder names
    ``"start"`` and ``"end"``. Otherwise the second cell is expected to read
    ``"<Opening|Sluiting> <start>-<end>"``; the text after the first word is
    split at its first hyphen into the start and end names.

    This will work in most cases. For some lines it does not work (for
    example place names that themselves contain a hyphen); in that case the
    data has to be filled in manually.

    Args:
        soup: Object whose ``findAll("td")`` returns the cells of the row,
            each either a string or an object offering ``get_text()``.

    Returns:
        The parsed ``Segment``. ``opening_or_closure`` is ``None`` when the
        row has a single cell or the first word is not recognised.

    Raises:
        IndexError: If the row contains no ``td`` cells at all.
    """
    columns = soup.findAll("td")

    # Normalise non-breaking spaces, both as a character and as a literal
    # entity. NOTE(review): the literal replaced by the original code was
    # lost in extraction; presumably it was one of these -- confirm.
    date = _cell_text(columns[0]).replace("\xa0", " ").replace("&nbsp;", " ")

    start = ""
    end = ""
    opening_or_closure = None

    if len(columns) == 1:
        # A single cell indicates that the segment takes up the entire line.
        start = "start"
        end = "end"
    else:
        words = _cell_text(columns[1]).split()
        if words:
            opening_or_closure = {
                "Opening": "opening",
                "Sluiting": "closure",
            }.get(words[0])
            # Everything before the first hyphen is the start, everything
            # after it is the end; stripping copes with "A - B" as well as
            # "A-B".
            start, _, end = " ".join(words[1:]).partition("-")
            start = start.strip()
            end = end.strip()

    return Segment(start, end, date, opening_or_closure)